/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice, 
 *     this list of conditions and the following disclaimer. 
 *  2. Redistributions in binary form must reproduce the above copyright 
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution. 
 *  3. The name of the author may not be used to endorse or promote products 
 *     derived from this software without specific prior written
 *     permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/*
 * Linux implementation of basic common services
 *
 * Contents:
 *     cxiGetThreadId
 *     getpid
 *     cxiStackAddrToThreadId
 *     cxiIsSuperUser
 *     DoPanic
 *     logAssertFailed
 *   Kernel memory allocation services:
 *     cxiMallocPinned
 *     cxiFreePinned
 *
 * $Id: cxiSystem.c,v 1.54.2.5 2002/05/21 21:44:58 dcraft Exp $
 *
 * $Log: cxiSystem.c,v $
 * Revision 1.54.2.5  2002/05/21 21:44:58  dcraft
 * Pull GPFS 1.2.1 up to kernel 2.4.18.
 * mmfsfuncs.Linux must be distributed with /usr/lpp/mmfs/src
 * on developerworks.
 *
 * Revision 1.54.2.4  2001/11/16 06:01:01  mcnabb
 * Defect 353169 (NAS 8049):
 * Trace call was tracing data past the end of the mapped page.
 *
 * Revision 1.54.2.3  2001/11/08 17:25:33  mcnabb
 * Defect 353822:  The defensive code for kproc death is now too stern
 * given the real problem is fixed via 353738.
 *
 * Revision 1.54.2.2  2001/11/02 04:14:37  mcnabb
 * Defect 353194.  As a defensive measure don't leave the kernel
 * on a rmmod until the kproc is really gone.
 *
 * Revision 1.54.2.1  2001/10/31 17:09:08  mcnabb
 * For linux, logAssertFailed in the kernel should delay a few seconds
 * after lxtrace fsync is called to allow the lxtrace daemon to write
 * the records.  This is done to try to capture the last few records
 * since we haven't been able to recover them from NAS dumps.
 *
 * Revision 1.54  2001/10/24 19:54:09  dixonbp
 * Defect 351972 (NAS 7374): Don't chain temporary file struct to revoke
 * lock structures, improve checking of lock structs during cxiFcntlReset,
 * and set FL_POSIX flag in one spot where it was missing.
 *
 * Revision 1.53  2001/10/24 18:13:07  dcraft
 * Defect 352112: Be more systematic about kproc startup and end.  Let
 * keventd spawn the kernel threads in linux so we can signal him to
 * reap them.
 *
 * Revision 1.52  2001/10/11 14:04:17  gjertsen
 * Provide support for 64-bit mutex lockword for IA64 (LOCKWORD64).
 *
 * Revision 1.51  2001/10/03 14:46:11  dcraft
 * First attempt to bring us up to 2.4.9 and 2.4.10
 *
 * Revision 1.50  2001/10/02 14:17:38  dixonbp
 * Defect 350209 (NAS 6961): when scanning file_lock structures for
 * advisory locks, check for null pointers in the file_lock, file,
 * and dentry structs.
 *
 * Revision 1.49  2001/09/25 19:02:35  gjertsen
 * Some IA64 code cleanup. Suppress unwanted compiler warnings.
 *
 * Revision 1.48  2001/09/24 19:37:48  jpalmer
 * Fix NAS defect 6663.  Replace unused l_vfs lock field in LINUX version 
 * with l_caller to distinguish LOCKD requested locks.
 *
 * Revision 1.47  2001/09/22 20:08:00  dcraft
 * Remove kiobufs from cxiKernelIODescriptor_t.  Use temporary
 * kiobufs for map/unmap.   Remove dead code and dead comments
 * in portability layer and update readmes and license.
 * Fix traceback to appear in mmfs.log file.
 *
 * Revision 1.46  2001/09/20 21:45:43  wyllie
 * Add call to determine if a cxiBlockingMutex_t has any threads waiting
 * for it
 *
 * Revision 1.45  2001/09/20 07:06:35  schmuck
 * Remove sleep_on_w_mutex and sleep_on_w_simple_lock.
 *
 * Revision 1.44  2001/09/19 19:56:57  gjertsen
 * Adjust cxiSleep calculation for large HZ values.
 *
 * Revision 1.43  2001/09/18 06:07:45  schmuck
 * Get rid of some compiler warnings.
 *
 * Revision 1.42  2001/09/14 15:44:18  dixonbp
 * Remove posix_locks_deadlock (no longer required for RH kernels).
 *
 * Revision 1.41  2001/09/12 23:54:36  eshel
 * Rename cxi_schedule() to cxiYield()
 *
 * Revision 1.40  2001/09/12 22:20:28  eshel
 * add cxi_schedule() to call schedule()
 *
 * Revision 1.39  2001/09/06 13:59:44  dixonbp
 * Move lock owner from cxiFcntlLock args and add it to cxiFlock_t.
 * cxiFcntlReset will now release locks based on inode (not file).
 *
 * Revision 1.38  2001/08/29 00:20:34  jpalmer
 * Change content of OpenFile advLkObjP from a file pointer 
 * to an inode pointer.  The file pointer cannot be guaranteed 
 * to still be valid and this was causing failures in revoke when 
 * an out-of-date file pointer was used.
 *
 * Revision 1.37  2001/08/21 14:48:17  mroberts
 *
 * Rename cxiIsSambaThread to cxiIsSambaOrLockdThread.
 * Create cxiIsSambaThread that tests only for smbd and not lockd.
 *
 * Revision 1.36  2001/08/20 15:03:19  gjertsen
 * Avoid using hardwired i386 constants for calculating current.
 *
 * Revision 1.35  2001/08/10 14:42:45  dixonbp
 * Put reference to MAX_GPFS_LOCK_NAMES under ifdef INSTRUMENT_LOCKS.
 *
 * Revision 1.34  2001/08/10 00:41:18  wyllie
 * Use lock indices instead of lock numbers when calling cxiBlockingMutexInit,
 * to simplify statistics gathering.  Add call to reset or read statistics
 * for cxiBlockingMutex_ts from the kernel.
 *
 * Revision 1.33  2001/08/09 21:11:17  dcraft
 * Modifications to allow running on latest Redhat 7.1 update
 * Kernel version 2.4.3-12.
 * Requires checkout of new site.mcr.proto
 *
 * Revision 1.32  2001/08/08 17:48:21  gjertsen
 * Need to move function out of ifdef.
 *
 * Revision 1.31  2001/08/06 23:36:01  wyllie
 * Count number of acquires and contention for cxiBlockingMutex_ts.
 * Add routine to determine if the current process holds the kernel lock.
 *
 * Revision 1.30  2001/08/02 22:40:25  dcraft
 * Don't attempt to avoid death when low paging space.  The code I put
 * in last week successfully gives us SIGTERM instead of SIGKILL, but the
 * selection criteria linux uses would still immediately select gpfs
 * again if we ignore SIGTERM.  There's no good solution for this on linux.
 *
 * Revision 1.29  2001/07/24 20:30:41  wyllie
 * Eliminate compiler warning
 *
 * Revision 1.28  2001/07/19 23:24:55  dcraft
 * Modified linux trace to allow non blocking trace record
 * writes (format is TRACE?N).  New gpfs swapd process created
 * which is responsible for reclaiming inodes (5 percent every
 * time it runs).  Marked all our inodes so that they would be
 * ignored by linux kswapd.  Added "unused" inode to inode
 * cache that could be used as a signal that linux kswapd is
 * running and kick off gpfs swapd.  Added means to ignore attempts
 * to kill mmfsd by kswapd if the system gets low on memory.
 * All done in an attempt to avoid kswapd premature wakeup on simple
 * locks and mutexes.
 *
 * Revision 1.27  2001/07/13 19:54:44  wyllie
 * Get PAGE_OFFSET by asking the mmfslinux_... kernel module rather than
 * compiling the constant into proprietary code.  Rename constants needed to
 * determine the GPFS memory map and export them to non-proprietary files.
 *
 * Revision 1.26  2001/07/13 05:30:24  wsawdon
 * Fix for Raleigh defect 3262 -- fs unmounted after delete snapshot.
 * Delete open NFS instances referring to a file in the snapshot.
 * Mark snapshot as invalid in the kernel's vfs mount table.
 *
 * Revision 1.25  2001/07/10 12:06:27  jpalmer
 * Add function to allow SMB Open and FCNTL tokens to be moved from one system to
 * another in response to the NAS load balancer moving the users.  Add the
 * external interface to provide a lock clamping function that will block new
 * locks during the time NFS is recovering its FCNTL and File locks.
 *
 * Revision 1.24  2001/07/06 17:08:46  gjertsen
 * Need an extra header include.
 *
 * Revision 1.23  2001/07/02 18:06:18  eshel
 * remove #define NFS_DEBUG (got in by mistake)
 *
 * Revision 1.22  2001/07/02 18:00:24  eshel
 * Initialize gpfsNode hash table based on (fileCacheLimit + statCacheLimit).
 *
 * Revision 1.21  2001/06/28 16:23:15  wyllie
 * Assert if trying to acquire a cxiBlockingMutex_t that is already held by
 * the caller, or if releasing one that is not held.
 *
 * Revision 1.20  2001/06/06 21:55:35  wyllie
 * Change the way NFSData objects are timed out.  Instead of using watchdog
 * timers that enqueue NFSData objects at interrupt level for later
 * processing by the NFS watchdog kernel process, just let the watchdog
 * process wake up periodically and examine its LRU list of NFSData objects
 * to decide which should be closed.  This fixes a bug on Linux that shows
 * up as the assert "nfsP->nextWatchP == NFS_WATCH_NOT_QUEUED" under high
 * load.  This also allows deleting a bunch of cxiXXX interfaces dealing
 * with watchdog timers.
 *
 * Revision 1.19  2001/05/25 14:48:17  gjertsen
 * Minor fixes to get IA64 code to compile again.
 *
 * Revision 1.18  2001/05/23 16:49:20  wyllie
 * Remove redundant macro definitions
 *
 * Revision 1.17  2001/05/15 18:23:44  dixonbp
 * cxiFcntlLock was not always initializing list_head structs
 *
 * Revision 1.16  2001/05/08 20:00:21  gjertsen
 * Account for name change with sync daemon in 2.4.2 kernel.
 *
 * Revision 1.15  2001/05/02 00:08:21  schmuck
 * Add ifdef to disable kernel panic when we hit an assert in the kernel
 * (compile with -DDISABLE_KERNEL_PANIC to disable cxiPanic).
 *
 * Revision 1.14  2001/04/30 23:29:51  dcraft
 * Defect 338328: kswapd is special and can be prematurely woken.
 *
 * Revision 1.13  2001/04/30 19:45:47  dixonbp
 * Add a comment.
 *
 * Revision 1.12  2001/04/27 19:48:33  wyllie
 * Do console print and traces early in logAssertFailed, so that the point
 * of the failure can more easily be found in the trace.  Add trace to
 * cxiSetMountInfo.
 *
 * Revision 1.11  2001/04/24 18:12:57  wyllie
 * Move handle of the trace file out of the shared segment.
 *
 * Revision 1.10  2001/04/23 23:08:29  dcraft
 * Fix disable_lock so it actually does what it says it does.
 * Perform FEXCL check before access check on create race condition.
 *
 * Revision 1.9  2001/04/23 22:18:19  jpalmer
 * Fix oplock hang by locating open file for ftruncInternal
 *
 * Revision 1.8  2001/04/23 18:08:16  eshel
 * Call DaemonToDie to get internal dump.
 *
 * Revision 1.7  2001/04/22 16:36:37  dcraft
 * Reimplement wait queue structure to have a chain of linux
 * wait queue heads and only one waiter per head.  This allows
 * us to control exactly which task will wake up.  Previously
 * the OS was free to select any task on the wait queue head.
 * This gave us incorrect semantics for "wakeup with result"
 * and resulted in crashes stating unexpected EINTR from wait.
 *
 * Revision 1.6  2001/04/17 19:58:40  dcraft
 * Defect 337521.  igrab() failed yet we still attempted an iput().
 *
 * Revision 1.5  2001/04/17 14:12:23  piucci
 * memcpy in cxiSetGroupListPID had incorrect number of bytes to copy
 *
 * Revision 1.4  2001/04/08 22:18:27  dcraft
 * Fix multinode delete race conditions.  Still incomplete.
 *
 * Revision 1.3  2001/04/06 21:00:15  gjertsen
 * Selectively screen out regular sync calls from kupdate.
 *
 * Revision 1.2  2001/04/05 18:16:58  schmuck
 * Fix compile errors in SMB_LOCKS code due to conversion to C.
 *
 * Revision 1.1  2001/04/05 13:30:58  gjertsen
 * Continue C++ to C conversion with manual C++2C utility.
 * Changes primarily for vfs stat stuff.
 *
 * Revision 1.95  2001/03/30 01:05:21  jpalmer
 * Add SMBopen lock functions for NLM interface
 *
 * Revision 1.94  2001/03/27 17:13:15  eshel
 * remove unused code
 *
 * Revision 1.93  2001/03/26 18:29:53  dcraft
 * Update inode attributes in OS node layer via callback to cxiSetOSNode
 * (previous setInode).  The attributes are now updated during kSFSGetattr()
 * while the lock is held to ensure validity.
 *
 * Revision 1.92  2001/03/21 15:00:21  dixonbp
 * In cxiGetNfsP, check for a valid vP before using it to retrieve nfsP.
 *
 * Revision 1.91  2001/03/16 23:35:28  jpalmer
 * Detect non-gpfs file in SMB open lock request
 *
 * Revision 1.90  2001/03/15 16:42:07  eshel
 * activate cxiPanic()
 *
 * Revision 1.89  2001/03/14 00:36:31  eshel
 * add cxiSetInode() to call setInode()
 *
 * Revision 1.88  2001/03/12 22:14:14  jpalmer
 * SMB Locks - add fput to balance fget of file descriptor
 *
 * Revision 1.87  2001/03/10 17:08:33  schmuck
 * cxiCloseNFS: don't call gpfsClose if the file was never succesfully opened.
 * Otherwise, if openNFS had failed for some reason, this was causing
 * null-pointer dereference in gpfsClose, or it caused asserts on mmap
 * counters going negative.
 *
 * Revision 1.86  2001/03/09 16:43:46  jpalmer
 * SMB Locking - remove old dfs Interops functions
 *
 * Revision 1.85  2001/03/07 20:05:35  jpalmer
 * SMB Open Lock function
 *
 * Revision 1.84  2001/03/07 00:50:53  eshel
 * remove unused code
 *
 * Revision 1.83  2001/03/06 04:58:25  manoj
 * Use Linux's find_task_by_pid() for kernels >= 2.4.0
 *
 * Revision 1.82  2001/03/05 23:28:07  dcraft
 * Modify inode and gpfsNode reference management.  Inode is now acquired
 * during gpfsNode creation and must be released via cxiPutOSNode().
 * (documented in gpfs database).  Add "mmfsadm dump vnodes" for producing
 * trace info on all held inodes.
 *
 * Revision 1.81  2001/03/05 17:40:48  eshel
 * add trace calls
 *
 * Revision 1.80  2001/03/02 22:45:39  jpalmer
 * SMB Locking function: user communications
 *
 * Revision 1.79  2001/03/01 19:59:20  dixonbp
 * Handle errors from ReleaseNFS and cxiRefOSNode
 *
 * Revision 1.78  2001/02/22 15:03:01  dixonbp
 * Temporary fix to cxiEventWakeup required since disable_lock doesn't work yet.
 *
 * Revision 1.77  2001/02/21 17:22:37  dixonbp
 * cxiEventSleep/Wakeup need to use disable_lock instead of simple_lock.
 *
 * Revision 1.76  2001/02/15 21:57:31  manoj
 * Support for NSS (NAS Security System). The filesystem stores NT-style ACLs and
 * uses NSS for access control and permission checking. All NSS code should be
 * #ifdef'ed by CONFIG_NSS.
 *
 * Revision 1.75  2001/02/12 18:29:45  dixonbp
 * cxiMallocPinned should check for (and fail) requests greater than
 * the maximum that kmalloc supports.
 *
 * Revision 1.74  2001/02/01 21:23:45  dixonbp
 * Fix in fcntl unlock.
 *
 * Revision 1.73  2001/01/31 17:32:04  dixonbp
 * Remove extra braces in cxiCloseNFS (no functional change).
 *
 * Revision 1.72  2001/01/27 15:42:19  dixonbp
 * NFS fixes to cxiCloseNFS and gpfs_f_lock.  Remove incorrect nfs handling
 * in gpfs_i_validate, and start to handle a nfs problem with gpfs_i_lookup.
 *
 * Revision 1.71  2001/01/25 18:35:51  wyllie
 * Rename panic to cxiPanic, which is implemented as an exported entry point of
 * the mmfslinux module on Linux, and as a macro that invokes panic on AIX.
 *
 * Revision 1.70  2001/01/24 21:24:45  wyllie
 * Make panic() call BUG() to stop the thread
 *
 * Revision 1.69  2001/01/15 14:53:15  dixonbp
 * Add a DBGASSERT, fix some comments.
 *
 * Revision 1.68  2001/01/12 02:55:53  eshel
 * Add code for read by inode.
 *
 * Revision 1.67  2001/01/09 23:12:40  wyllie
 * Get wake_up to compile and run on 2.4.0
 *
 * Revision 1.66  2000/12/15 13:56:36  gjertsen
 * Clean up documentation.
 *
 * Revision 1.65  2000/12/14 20:54:14  wyllie
 * Make cxiWaitEvent be allocated inside of Simple_lock object, instead of
 * malloc'ed elsewhere.  Record owner of Simple_lock as a stack address instead
 * of a thread id.  Fix bug in sleep_on_w_mutex: no traces are allowed
 * between setting current->state to not runnable and calling schedule().
 *
 * Revision 1.64  2000/12/13 18:34:59  wyllie
 * Convert more Simple_locks to cxiBlockingMutex_ts.  Implement
 * cxiBlockingMutexHeldByCaller() instead of lock_mine().
 *
 * Revision 1.63  2000/12/12 16:33:15  wyllie
 * Change e_sleep_thread and cxiEventWakeup... to cxiWaitEventWait and
 * cxiWaitEventSignal/Broadcast.
 *
 * Revision 1.62  2000/12/08 18:35:11  wyllie
 * Add cxiBlockingMutex_t type and operations that act on it.  Uses simple_lock
 * on AIX and binary semaphores on Linux.
 *
 * Revision 1.61  2000/12/06 16:52:39  dcraft
 * write inline versions of cxiString functions for linux kernel
 *
 * Revision 1.60  2000/12/05 18:44:02  wyllie
 * Make cxiIsNFSThread use integer comparisons instead of a string compare
 *
 * Revision 1.59  2000/12/01 02:10:57  schmuck
 * Instead of assigning NULL function pointers when initializing or resetting the
 * gpfs_operations table, have it point to a dummy function that returns ENOSYS.
 * This avoids having to check for NULL before each call.
 *
 * Revision 1.58  2000/11/13 16:45:08  wyllie
 * Clean up simple_lock code and do less tracing
 *
 * Revision 1.57  2000/11/10 18:18:15  wyllie
 * Inline calls to __copy_to/from_user inside of cxiUiomove
 *
 * Revision 1.56  2000/11/07 00:16:20  eshel
 * Add code to support remount.
 *
 * Revision 1.55  2000/11/06 19:56:09  gjertsen
 * Linux code cleanup and put in build safeguards.
 *
 * Revision 1.54  2000/11/02 19:46:19  gjertsen
 * Linux code split. Pull out NBD stuff.
 *
// Revision 1.53  2000/11/02  14:48:55  dixonbp
// Rename ltrace.h to lxtrace.h and move a trace point.
//
 * Revision 1.52  2000/10/31 15:41:53  gjertsen
 * Account for atomic operations in cxi layer.
 *
// Revision 1.51  2000/10/27  01:45:34  wyllie
// Clean up ltrace: Fix _STrace of string arguments that are not the last
// parameter.  Check for buffer overflow when building S or X type traces.
// Shorten header in trace file by eliminating and combining fields.  Make
// sure strings in trace file are null-terminated.  Round string lengths to
// a multiple of the word size.
//
// Revision 1.50  2000/10/26  22:15:23  schmuck
// Put references to nfs debug control variables under ifdef NFS_DEBUG,
// so that loading the gpfs kernel module does not require having loaded
// nfs modules first.
//
// Revision 1.49  2000/10/26  20:52:27  gjertsen
// Purge out ugly USE_CWRAPPERS and export module symbols explicitly
// as the default (in IA64 safe manner).
//
// Revision 1.48  2000/10/26  15:11:50  dixonbp
// Fix to fcntl deadlock detection.
// Validate advObjP in cxiFcntlLock before attempting to lock.
// Create cxiFcntlUnblock to cleanup after an fcntl lock wait
// is interrupted.
//
// Revision 1.47  2000/10/25  16:53:37  wyllie
// Trace include reorganization: split old tasking/Trace.h into five new
// files, and move most of them into directories that will be shipped with
// GPFS on Linux.  Also change the code in trcid.h files generated by
// mktrace to not include AIX specific stuff like direct use of AIX trace
// macros.  Change names of all macros in generated code to have a leading
// underscore.  As a result of these changes, all trcid.h files had to
// be rebuilt.
//
// Revision 1.46  2000/10/24  14:04:44  gjertsen
// Clean up linux module specific code so that gpfs can be
// compiled in the kernel (to allow IA64 kernel debugging).
//
// Revision 1.45  2000/10/20  23:20:13  wyllie
// Use direct calls to functions in gpfs_ops instead of going through a local
// static pointer.
//
// Revision 1.44  2000/10/19  19:11:45  dixonbp
// Platform-specific functions for NFS and watchdog.
//
// Revision 1.43  2000/10/14  00:17:32  wyllie
// Use proper version of lock include file.
//
// Revision 1.42  2000/10/05  15:50:12  wyllie
// Correct a trace
//
// Revision 1.41  2000/10/04  19:52:50  gjertsen
// Only include smplock.h for SMP configuration (to allow use of uniproc kernel).
//
// Revision 1.40  2000/10/02  20:42:19  wyllie
// Clean up multi-platform mutex code.  Add assembler versions of the fast path
// for mutex acquire and release for i386.
//
// Revision 1.39  2000/09/26  23:22:07  wyllie
// Experiments with other methods for doing I/O.  Not ready for prime time.
//
// Revision 1.38  2000/09/20  14:23:09  dixonbp
// cxiFcntlLock fixes required by the change to the 2.4.0-test7 kernel.
//
// Revision 1.37  2000/09/16  21:42:52  eshel
// Remove LATEST_LINUX_KERNEL ifdef code.
//
// Revision 1.36  2000/08/29  18:31:51  dcraft
// Produce mmfs module.
//
// Revision 1.35  2000/08/28  14:13:20  gjertsen
// Need to export all kernel symbols explicitly in IA64
// Linux due to bug with insmod.
//
// Revision 1.34  2000/08/24  22:15:02  wyllie
// Fix cxiWaitEventWakeupOne (and hence condition variable signal) to wake up
// only one waiting thread instead of all of them.  The trick is to set the
// TASK_EXCLUSIVE bit in the process state before calling Linux wake_up.
//
// Revision 1.33  2000/08/21  15:40:38  gjertsen
// Add in 64-bit atomic ops for IA64. Later split into arch dependant dirs.
//
// Revision 1.32  2000/08/11  20:13:34  eshel
// Fixs for compile and load on IA64.
//
// Revision 1.31  2000/08/10  23:05:50  eshel
// Get mmfslinux module to build and load for IA64.
//
// Revision 1.30  2000/08/10  18:51:41  dixonbp
// Add cxiVFSToFlock and cxiFlockToVFS routines to convert from
// generic cxiFlock_t to platform specific fcntl locking structs.
//
// Revision 1.29  2000/08/10  00:01:36  wyllie
// Under TRACE_IO_DATA ifdef, trace prefix of data copied by cxiUiomove.
//
// Revision 1.28  2000/08/08  23:41:51  eshel
// Updates for 2.4.0test4-000715-34 kernel used for ia64 version.
//
// Revision 1.27  2000/08/07  22:33:04  dcraft
// Use new cxiGetTOD() time of day function for second,
// nanosecond needs.  Define 32 bit time types and start
// using in place of current on disk time_t types (necessary
// for 64 bit).  Use HiResTime where possible.
//
// Revision 1.26  2000/08/03  15:43:29  gjertsen
// Keep terminate function out of IA64 for now.
//
// Revision 1.25  2000/08/01  21:26:03  wyllie
// Change tracing of simple_lock to use trace class TRACE_KLOCKL.
//
// Revision 1.24  2000/07/26  19:35:01  dcraft
// e_sleep_thread() return codes changed to be closer to AIX.
// Interrupted sleep on event word now retries sleep in case
// debugger causes wakeup.
// Mutex event words deallocated by awakened thread instead
// of waker thread.  Done due to differing semantics where linux
// threads are responsible for removing themselves from the
// event queue.
//
// Revision 1.23  2000/07/25  16:18:05  gjertsen
// Add in function prototypes and a few type cast fixes.
//
// Revision 1.21  2000/07/21  22:46:33  schmuck
// Fix for compile with -DNDEBUG.
//
// Revision 1.20  2000/07/21  18:55:55  dixonbp
// Fix a bad trace.
//
// Revision 1.19  2000/07/20  15:49:18  dcraft
// Abstract event words for OS independence
//
// Revision 1.18  2000/07/19  13:20:05  gjertsen
// Clean up code by adding typecasts, func prototypes, and misc compile fixes.
// Take care of module.h using `new' keyword for IA64.
//
// Revision 1.17  2000/07/12  01:28:37  schmuck
// Replace "#ifdef LTRACE" with "#ifndef GPFS_PRINTF".
//
// Revision 1.16  2000/07/11  16:35:10  wyllie
// Use cxiUio_t instead of struct uio.  Use cxiUiomove instead of uiomove.  Use
// CXI_READ instead of UIO_READ, etc.
//
// Revision 1.15  2000/06/26  14:42:29  dixonbp
// Create cxiTrace to do the linux-specific trace flag manipulations.
//
// Revision 1.14  2000/06/13  14:48:39  dixonbp
// Add cxiFcntlReset to release all local advisory locks for a filesystem.
// Add traces and a fix to handle unlock 0-0 in cxiFcntlLock.
//
// Revision 1.13  2000/06/12  21:41:51  dixonbp
// Move cxiFlock_t from cxiTypes-plat to cxiTypes.
//
// Revision 1.12  2000/06/08  13:06:19  gjertsen
// Introduce cxi types used at user-level and kernel extension
// interfaces for xmem and uio. Purged out gpfs_flock with cxiFlock_t.
//
// Revision 1.11  2000/06/07  16:54:09  eshel
// Change PDEBUG to TRACE
//
// Revision 1.10  2000/06/06  20:02:36  dixonbp
// Add cxiFcntlLock for linux specific advisory locking on the local node.
//
// Revision 1.9  2000/06/02  21:19:53  eshel
// add Mailbox_kernelOpsDisabled to keep Mailbox status in the module
//
// Revision 1.8  2000/06/02  20:36:14  eshel
// move more code from dirlinux to gpl-linux
//
// Revision 1.7  2000/06/02  16:57:00  eshel
// copy logAssertFailed to the module
//
// Revision 1.6  2000/06/01  22:10:40  eshel
// Move VFSStats to platform dependent directories.
//
// Revision 1.5  2000/05/31  01:06:10  eshel
// final step of module split
//
// Revision 1.4  2000/05/30  23:41:51  eshel
// more for the module split
//
// Revision 1.3  2000/05/30  21:27:47  wyllie
// Use cxi prefix instead of kxi.
//
// Revision 1.2  2000/05/26  23:39:11  wyllie
// Avoid use of atomic_t.  Replace with int.
//
// Revision 1.1  2000/05/19  23:23:09  wyllie
// Use cxiGetThreadId instead of thread_self, GETID, or Thread::currentThreadId.
//
 */

#include <Shark-gpl.h>

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/time.h>
#include <linux/file.h>
#include <linux/string.h>
#include <asm/uaccess.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/vmalloc.h>

#undef memcmp

#define DEFINE_TRACE_GBL_VARS
#include <cxiSystem.h>
#include <cxiAtomic.h>
#include <cxi2gpfs.h>
#include <cxiSharedSeg.h>

#include <linux2gpfs.h>
#include <Trace.h>
#include <lxtrace.h>
#include <cxiMode.h>

/* Local prototype for posix_locks_deadlock(), declared only when building
   against a kernel whose LINUX_KERNEL_VERSION is 2040900 or higher.
   NOTE(review): presumably declared here because the kernel headers do not
   provide a prototype for it -- confirm against the target kernel. */
#if LINUX_KERNEL_VERSION >= 2040900
/* This is in the Redhat kernel series */
extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
#endif

/* Table of BlockingMutexStats entries, MAX_GPFS_LOCK_NAMES long.  Only
   compiled in when INSTRUMENT_LOCKS is defined.
   NOTE(review): presumably one entry per lock index, used to gather
   acquire/contention counts for cxiBlockingMutex_t -- confirm. */
#ifdef INSTRUMENT_LOCKS
struct BlockingMutexStats BlockingMutexStatsTable[MAX_GPFS_LOCK_NAMES];
#endif  /* INSTRUMENT_LOCKS */

/* Return the thread ID of the caller, which on Linux is the pid field of
   the current task. */
cxiThreadId cxiGetThreadId()
{
  cxiThreadId callerTid = current->pid;

  return callerTid;
}

/* Return the process ID of the caller (the pid field of the current
   task). */
pid_t getpid()
{
  pid_t callerPid = current->pid;

  return callerPid;
}

/* Look up the task_struct for the given pid via find_task_by_pid().
   The result is passed through unchanged; callers in this file treat a
   NULL result as "no such task". */
static struct task_struct *find_task(pid_t pid)
{
  struct task_struct *taskP = find_task_by_pid(pid);

  return taskP;
}

#ifdef CONFIG_NSS
#include <linux/capability.h>

/* Report whether a task with the given process ID can be found.
   Returns true when find_task() yields a task, false otherwise. */
Boolean cxiIsValidPID(pid_t pid)
{
  struct task_struct *taskP = find_task(pid);

  if (taskP == NULL)
    return false;
  return true;
}

/* Get the group list for current process */
gid_t * cxiGetGroupList(int *numgroups)
{
  *numgroups = current->ngroups;
  return current->groups;
}

/* Return the fsuid field of the current task as an int. */
int cxiGetFSUid()
{
  int callerFsUid = current->fsuid;

  return callerFsUid;
}

/* Return the fsgid field of the current task as an int. */
int cxiGetFSGid()
{
  int callerFsGid = current->fsgid;

  return callerFsGid;
}

/* Return the group list of the task with the given process ID.
   On success the task's ngroups is stored through numgrps and a pointer
   to the task's own groups array (not a copy) is returned.  If no task is
   found, *numgrps is set to -1 and NULL is returned. */
gid_t * cxiGetGroupListPID(pid_t pid, int *numgrps)
{
  struct task_struct *taskP = find_task(pid);

  if (taskP != NULL)
  {
    *numgrps = taskP->ngroups;
    return taskP->groups;
  }

  *numgrps = -1;
  return NULL;
}

/* Replace the group list of the task identified by pid.
 *
 *   pid     - process ID of the target task
 *   groups  - array holding numgrps group IDs to install
 *   numgrps - number of entries in groups
 *
 * Returns 0 on success.  Returns -1 if no task with that pid is found, or
 * if numgrps is negative or exceeds the capacity of the task's groups
 * array.  The count is validated before the copy so that an oversized
 * request cannot write past the end of the array embedded in the
 * task_struct.
 */
int cxiSetGroupListPID(pid_t pid, gid_t *groups, int numgrps)
{
  struct task_struct *p = find_task(pid);
  if (p == NULL) return -1;

  /* Capacity is taken from the destination itself; this relies on
     task_struct.groups being an array member (not a pointer). */
  if (numgrps < 0 ||
      numgrps > (int)(sizeof(p->groups) / sizeof(p->groups[0])))
    return -1;

  memcpy(p->groups, groups, numgrps*sizeof(gid_t));
  p->ngroups = numgrps;
  return 0;
}

/* Overwrite the first two entries of the group list of the task with the
   given pid with g1 and g2.  The task's ngroups count is not modified.
   Returns 0 on success, -1 if no such task is found. */
int cxiSetPAG(pid_t pid, gid_t g1, gid_t g2)
{
  struct task_struct *taskP = find_task(pid);

  if (taskP != NULL)
  {
    taskP->groups[0] = g1;
    taskP->groups[1] = g2;
    return 0;
  }

  return -1;
}

/* Thin wrapper around the kernel's capable(): returns its result for the
   capability number passed in type. */
int cxiIsCapable(int type)
{
  int hasCapability = capable(type);

  return hasCapability;
}
#endif

/* Return true when the command name of the current task (current->comm)
   is exactly "kswapd". */
Boolean
cxiIsKswapdThread()
{
  int nameDiff = strcmp(current->comm, "kswapd");

  return (nameDiff == 0);
}

/* Convert a kernel stack address to the ID of the thread using that stack.
   The address is rounded down to a THREAD_SIZE boundary and the result is
   treated as a pointer to the owning task_struct, whose pid is stored
   through tidP.  Always returns 0. */
int cxiStackAddrToThreadId(char* stackP, cxiThreadId* tidP)
{
  UIntPtr alignMask = ~((UIntPtr)(THREAD_SIZE-1));
  UIntPtr taskAddr = (UIntPtr)stackP & alignMask;

  *tidP = ((struct task_struct *) taskAddr)->pid;
  return 0;
}

/* Convert a kernel thread pointer to the corresponding thread ID.
 *
 *   threadP - address of the thread's task_struct; expected to be aligned
 *             on a THREAD_SIZE boundary (checked by DBGASSERT)
 *   tidP    - receives the pid of that task
 *
 * Always returns 0.
 */
int cxiThreadPtrToThreadId(char* threadP, cxiThreadId* tidP)
{
  struct task_struct * tP;

  /* The bitwise AND must be parenthesized: == binds tighter than &, so
     without the extra parentheses the expression would reduce to
     threadP & (mask == 0), which is always 0. */
  DBGASSERT(((UIntPtr)threadP & ((UIntPtr)(THREAD_SIZE-1))) == 0);
  tP = (struct task_struct *) threadP;
  *tidP = tP->pid;
  return 0;
}


/* This function could be used to set yourself up to receive
 * a signal in the case of low paging space.  For AIX we do this
 * by setting up a signal handler for SIGDANGER.  For Linux you
 * can set this odd capability and that will cause SIGTERM to
 * be thrown.  Unfortunately if you get low on paging space in
 * Linux, the signal will get thrown but the selection criteria
 * for the killable process will not change.  It would thus
 * get in a hard loop trying to signal what it thinks is the best
 * process.   There doesn't appear to be a good solution in Linux
 * to avoid being killed and hence this is currently unused.
 *
 *   pid - process ID of a task whose process group should be marked
 *
 * Raises CAP_SYS_RAWIO in the permitted and effective capability sets of
 * every task in the same process group as pid.
 *
 * Returns 0 on success, or -ENOENT if no task with that pid is found.
 */
int
cxiLowPagingNotify(pid_t pid)
{
  int pgrp;
  struct task_struct *p;

  /* Hold tasklist_lock across the lookup as well as the walk, so the task
     found cannot go away before its pgrp is read. */
  read_lock(&tasklist_lock);

  p = find_task(pid);
  if (p == NULL)
  {
    read_unlock(&tasklist_lock);
    return -ENOENT;
  }

  pgrp = p->pgrp;
  for_each_task(p)
  {
    if (p->pgrp != pgrp)
      continue;
    /* CAP_SYS_RAWIO is a capability number, not a bit mask; shift it into
       position (same value as CAP_TO_MASK(CAP_SYS_RAWIO)) before OR-ing
       it into the capability sets. */
    p->cap_permitted |= (1 << CAP_SYS_RAWIO);
    p->cap_effective |= (1 << CAP_SYS_RAWIO);
  }
  read_unlock(&tasklist_lock);

  return 0;
}


/* Return true if the caller has maximum authorization (is root), i.e. the
   effective uid of the current task is 0. */
Boolean cxiIsSuperUser()
{
  Boolean callerIsRoot = (current->euid == 0);

  return callerIsRoot;
}

/* Allocate pinned kernel memory.
 *
 *   nBytes - number of bytes requested
 *
 * Returns a pointer obtained from kmalloc(nBytes, GFP_KERNEL), or NULL if
 * the request is out of range or kmalloc itself fails.  Release the memory
 * with cxiFreePinned.
 */
void* cxiMallocPinned(int nBytes)
{
  /* kmalloc only supports requests for up to 131072 bytes.  Anything
     larger than this results in a BUG() call.  A negative count would be
     converted to a huge unsigned size when passed to kmalloc, so it is
     rejected here as well. */
  if (nBytes < 0 || nBytes > 131072)
    return NULL;

  return kmalloc(nBytes, GFP_KERNEL);
}

/* Release pinned kernel memory previously obtained from cxiMallocPinned
   by handing it to kfree().
   This routine must not block on lack of memory resources. */
void cxiFreePinned(void* p)
{
  kfree(p);
  return;
}

/* Return the owner to use for an fcntl lock: the l_owner field of flP when
   a lock structure is supplied, otherwise the files pointer of the current
   task (current->files). */
void* cxiGetFcntlOwner(eflock_t *flP)
{
  void *ownerP = flP ? flP->l_owner : current->files;

  return ownerP;
}

/* Perform local advisory locking.
 *
 *   advObjP      inode the lock applies to (used as struct inode *)
 *   cmd          F_GETLK, F_SETLK or F_SETLKW
 *   lockStructP  platform dependent lock (struct file_lock *); only used
 *                when flockP is NULL
 *   flockP       generic lock description; when non-NULL it is converted
 *                to a struct file_lock on entry and the outcome is
 *                converted back into it before returning
 *   retryCB      routine installed as fl_notify when an F_SETLKW request
 *                is queued behind a conflicting lock
 *   size, offset not referenced by this implementation
 *   retry_idP    when non-NULL and the request is queued, receives the
 *                address of the queued file_lock
 *
 * Returns 0, the value produced by posix_lock_file(), EAGAIN when a
 * conflicting lock exists, EDEADLK when queueing would deadlock, ENOMEM
 * when the lock element cannot be allocated, or EINVAL when neither
 * lockStructP nor flockP was supplied.
 */
int cxiFcntlLock(void *advObjP,
                 int cmd,
                 void *lockStructP,
                 cxiFlock_t *flockP,
                 int (*retryCB)(),
                 cxiOff64_t size,
                 cxiOff64_t offset,
                 ulong *retry_idP)
{
  int rc = 0;
  struct file_lock fl, *flP, *gflP;
  /* Initialized so the exit trace never prints an indeterminate value
     when the conflict test is skipped (F_UNLCK requests). */
  struct file_lock *cflP = NULL;
  Boolean keepLockElement = false;

  /* cast platform independent arguments as appropriate for linux */
  void (*RetryFcn)(struct file_lock*) = (void (*)(struct file_lock*))retryCB;

  /* Only the inode is available here, so a stack file/dentry pair is
     built whose sole populated path is f_dentry->d_inode. */
  struct file localFile;
  struct dentry localDEntry;
  flP = (struct file_lock *) lockStructP;

  localFile.f_dentry = &localDEntry;
  localDEntry.d_inode = (struct inode *)advObjP;

  /* Callers have the option of passing a platform dependent lock structure
     (struct file_lock *lockStructP) or the generic (cxiFlock_t *flockP). */
  if (flockP)
  {
    flP = &fl; /* Use a local file_lock structure */

    /* If there is a potential for blocking, must malloc the locking structure
       so it can persist until the lock becomes available (in Retry()). */

    if (cmd == F_SETLKW)
    {
      flP = (struct file_lock*)cxiMallocUnpinned(sizeof(struct file_lock));
      if (flP == NULL)
      {
        rc = ENOMEM;
        goto exit;
      }
    }

    cxiMemset(flP, 0, sizeof(*flP));
    locks_init_lock(flP); /* Initialize list_head structs */

    flP->fl_flags = FL_POSIX;
    flP->fl_file  = NULL; /* don't pass stack file struct */

    /* fl_wait needs to be initialized because when unlock happens, the
       linux routine locks_wake_up_blocks invokes our retry routine via
       fl_notify and then calls wake_up(fl_wait) on the assumption that
       the waiter is local. */

    cxiWaitEventInit((cxiWaitEvent_t *)&flP->fl_wait);

    cxiFlockToVFS(flockP, flP);
  }

  /* Neither form of lock description was supplied: nothing to work on. */
  if (flP == NULL)
  {
    rc = EINVAL;
    goto exit;
  }

  /* daemon didn't know the owner and required kernel code to fill it in. */
  if (!flP->fl_owner)
    flP->fl_owner = (fl_owner_t)cxiGetFcntlOwner(NULL);

  /* Note that this all depends on us having serialized such locking for
     this file from before the posix_test_lock() until after the
     posix_block_lock().  The revoke lock that we hold here provides us
     the necessary serialization. */

  TRACE7(TRACE_VNODE, 3, TRCID_FCNTLLOCK_ENTER,
         "cxiFcntlLock posix_lock_file: pid %d owner 0x%X inodeP 0x%X "
         "range 0x%lX-%lX cmd %s type %s\n",
         flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
         (cmd == F_GETLK) ? "GETLK" : (cmd == F_SETLK) ? "SETLK" : "SETLKW",
         (flP->fl_type == F_RDLCK) ? "RDLCK" :
         (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");

  if (cmd == F_GETLK)
  {
    /* Check for conflicts.  If found, return the information.
       If there are NO conflicts, return F_UNLCK in fl_type. */
    if (NULL != (gflP = posix_test_lock(&localFile, flP)))
    {
      flP->fl_start = gflP->fl_start;
      flP->fl_end = gflP->fl_end;
      flP->fl_type = gflP->fl_type;
      flP->fl_pid = gflP->fl_pid;
      flP->fl_owner = gflP->fl_owner;
    }
    else
      flP->fl_type = F_UNLCK;

    TRACE6(TRACE_VNODE, 3, TRCID_FCNTLLOCK_GETLK,
           "cxiFcntlLock getlk: pid %d owner 0x%X inodeP 0x%X "
           "range 0x%lX-%lX type %s\n",
           flP->fl_pid, flP->fl_owner, advObjP, flP->fl_start, flP->fl_end,
           (flP->fl_type == F_RDLCK) ? "RDLCK" :
           (flP->fl_type == F_WRLCK) ? "WRLCK" : "UNLCK");
  }
  else
  { /* Begin: do the locking, but handle the blocking via our retry routine. */
    /* Test the lock.   What this really does for us is return the blocker
       if one exists.  This is needed to queue up the request if a conflicting
       lock is already held. */

    if ((flP->fl_type == F_UNLCK) || !(cflP = posix_test_lock(&localFile, flP)))
    {
      /* No conflicting lock:  get the lock for the caller. */

      rc = posix_lock_file(&localFile, flP, 0);
    }
    else
    { /* Conflicting lock:  ..... */
      rc = EAGAIN;

      if (cmd == F_SETLKW)
      {
#if !NFS4_LINUX_2_4_4 || NFS4_LINUX_PATCH
        if (posix_locks_deadlock(flP, cflP))
        {
          rc = EDEADLK;
        }
        else
#endif
        {
          /* Queue the blocker structures */
          keepLockElement = true;
          if (retry_idP)
            *retry_idP = (ulong)flP; // returned to caller and saved in sleepElement
          flP->fl_notify = RetryFcn;
          posix_block_lock(cflP, flP);
        }
      }
    }

    TRACE2(TRACE_VNODE, 3, TRCID_FCNTLLOCK_EXIT,
           "cxiFcntlLock posix_lock_file: rc 0x%X retry_id 0x%lX\n", rc, cflP);
  } /* End: do the locking, but handle the blocking via our retry routine. */

exit:

  if (flockP)
  {
    /* Caller wanted results in flockP (a no-op when flP is NULL) */
    cxiVFSToFlock((void *)flP, flockP);

    /* If we allocated the locking structure and then didn't need to use
       it (the lock request didn't block), free it.  flP is NULL here when
       the allocation itself failed, in which case there is nothing to
       free. */

    if ((flP != NULL) && (flP != &fl) && (!keepLockElement))
      cxiFreeUnpinned(flP);
  }

  return rc;
}

void cxiFcntlUnblock(void *retry_idP)
{
  struct file_lock *flP = (struct file_lock *)retry_idP;

  /* Include some sanity checks on the retry id (file_lock)
     before passing it into the routine that does the work.
     It should be properly linked (via its list_head structures)
     in a file_lock_list that has blocked waiters.  Also,
     we would only be backing this out by the process that
     has originally blocked, so verify the pid. */

  if (!list_empty(&flP->fl_block) && !list_empty(&flP->fl_link)  &&
       flP->fl_next && flP->fl_pid == getpid())
  {
    posix_unblock_lock(flP);
  }
}

/* Release the advisory locks held on every inode of the given file system.
 * vfsP is the Linux super_block; mmfsd_pid is handed through unchanged to
 * gpfs_ops.gpfsFcntlReset.  Returns the rc of the last gpfsFcntlReset call
 * (0 if none was made); the scan stops early when that rc is ENOSYS. */
int
cxiFcntlReset(void *vfsP, cxiPid_t mmfsd_pid)
{
  int rc = 0;
  struct super_block *sbP = (struct super_block *)vfsP;
  struct list_head *fllP;
  struct file_lock *fl;
  struct dentry *dentryP;

  lock_kernel();

restart:
  for (fllP = file_lock_list.next; fllP != &file_lock_list; )
  {
    fl = list_entry(fllP, struct file_lock, fl_link);
    fllP = fllP->next;

    /* Skip entries that cannot be traced back to an inode */
    if (!fl ||
        !fl->fl_file ||
        !fl->fl_file->f_dentry ||
        !fl->fl_file->f_dentry->d_inode)
      continue;

    /* Only locks that belong to the specified vfs are of interest */
    dentryP = fl->fl_file->f_dentry;
    if (dentryP->d_sb != sbP)
      continue;

    /* remove all our locks on this inode */
    rc = gpfs_ops.gpfsFcntlReset((void *)dentryP->d_inode, mmfsd_pid);
    if (rc == ENOSYS)
      break;

    /* An unknown number of locks (all locks for the inode) has just been
       freed, so the saved list position may be stale: rescan from the
       top of the lock list. */
    goto restart;
  }

  unlock_kernel();
  return rc;
}

/* Return the private data pointer stored in the generic slot
   (u.generic_sbp) of the Linux super_block passed in as vfsP. */
void *
cxiGetPrivVfsP(void *vfsP)
{
  return ((struct super_block *)vfsP)->u.generic_sbp;
}


#ifdef NFS_DEBUG
/* These flags are defined in the kernel and control various cprintk
   calls.  This provides us a way to easily turn these on/off for
   debugging our NFS support. */
extern unsigned int nlm_debug;
extern unsigned int nfsd_debug;
extern unsigned int nfs_debug;
extern unsigned int rpc_debug;
#endif

/* Switch the kernel's NFS related debug flags on or off.
 * Returns 0 on success, EINVAL for an unrecognized request, and ENOSYS
 * when the module was built without NFS_DEBUG. */
int cxiTrace(cxiTrace_t trace)
{
#ifndef NFS_DEBUG
  return ENOSYS;
#else
  unsigned int debugMask;

  if (trace == cxiTraceNFS)
    debugMask = ~0;          /* everything on */
  else if (trace == cxiTraceNFSoff)
    debugMask = 0;           /* everything off */
  else
    return EINVAL;

  nlm_debug = nfsd_debug = nfs_debug = rpc_debug = debugMask;
  return 0;
#endif
}

/* Fill in a Linux struct file_lock (vP) from the generic lock description
 * lckdatP.  Nothing is done if either pointer is NULL.  A length of zero
 * is translated to an end offset of FL_OFFSET_MAX. */
void cxiFlockToVFS(eflock_t* lckdatP, void* vP)
{
  struct file_lock* flP = (struct file_lock *)vP;

  if (!flP || !lckdatP)
    return;

  /* Always a POSIX lock; additionally tagged when lockd is the caller */
  flP->fl_flags = FL_POSIX;
  if (lckdatP->l_caller == L_CALLER_LOCKD)
    flP->fl_flags |= FL_LOCKD;

  flP->fl_pid   = lckdatP->l_pid;
  flP->fl_owner = lckdatP->l_owner;
  flP->fl_type  = lckdatP->l_type;
  flP->fl_start = lckdatP->l_start;

  /* Convert (start, length) into an inclusive end offset */
  if (lckdatP->l_len != 0)
    flP->fl_end = lckdatP->l_len + lckdatP->l_start - 1;
  else
    flP->fl_end = FL_OFFSET_MAX;
}

/* Fill in the generic lock description lckdatP from a Linux
 * struct file_lock (vP).  Nothing is done if either pointer is NULL.
 * An end offset of FL_OFFSET_MAX is reported as a length of zero. */
void cxiVFSToFlock(void *vP, eflock_t *lckdatP)
{
  struct file_lock* flP = (struct file_lock *)vP;

  if (!flP || !lckdatP)
    return;

  /* Report whether the lock was tagged as coming from lockd */
  if (flP->fl_flags & FL_LOCKD)
    lckdatP->l_caller = L_CALLER_LOCKD;
  else
    lckdatP->l_caller = L_CALLER_NULL;

  lckdatP->l_pid    = flP->fl_pid;
  lckdatP->l_owner  = flP->fl_owner;
  lckdatP->l_type   = flP->fl_type;
  lckdatP->l_start  = flP->fl_start;

  /* Convert the inclusive end offset back into a length */
  if (flP->fl_end != FL_OFFSET_MAX)
    lckdatP->l_len = flP->fl_end - flP->fl_start + 1;
  else
    lckdatP->l_len = 0;
}


/* Put the calling task to sleep (interruptibly) for roughly the given
   number of milliseconds, converted to scheduler ticks. */
void cxiSleep(int ms)
{
  long ticks;

  TRACE1(TRACE_VNODE, 9, TRCID_SLEEP,
         "cxiSleep: begin delay %d\n", ms);

#if (HZ > 1000)
  /* For large HZ rearrange jiffies calculation and
     use presumably larger word size to minimize overflow risk */
  ticks = ((long)ms)*HZ/1000;
#else
  ticks = ms/(1000/HZ);
#endif

  current->state = TASK_INTERRUPTIBLE;
  schedule_timeout(ticks);

  TRACE2(TRACE_VNODE, 9, TRCID_SLEEP_END,
         "cxiSleep: end delay %d HZ %d\n", ms, HZ);
}


/* Take a reference on the cxiNode behind inode iP on behalf of NFS and
   assert that the resulting reference count is non-zero. */
void cxiOpenNFS(void *iP)
{
  struct inode *inodeP = (struct inode *)iP;

  /* A reference is placed on the cxiNode here when the first NFS reference
     is added */
  int refcount = cxiRefOSNode(NULL, ((cxiNode_t *)(cxiGetCnP(inodeP))), iP, 1);

  TRACE7(TRACE_VNODE, 3, TRCID_OPENNFS,
        "openNFS iP 0x%lX ino %d (0x%X) mode 0x%X nlink %d gen_ip 0x%lX "
        "refcount %d\n",
        inodeP, (inodeP) ? inodeP->i_ino : -1,
        (inodeP) ? inodeP->i_ino : -1,
        (inodeP) ? inodeP->i_mode : -1,
        (inodeP) ? inodeP->i_nlink : -1,
        (inodeP) ? inodeP->u.generic_ip : NULL,
        refcount);

  DBGASSERT(refcount != 0);
}


/* Close an NFS-held file instance and release the OS node.
 * vP is the inode; viP is the open instance, or NULL if the file was never
 * actually opened (in which case only the node is released).
 * Returns 0 or the rc of gpfs_ops.gpfsClose. */
int cxiCloseNFS(void *vP, void *viP)
{
  struct inode *iP = (struct inode *)vP;
  int rc = 0;

  if (viP != NULL)
    rc = gpfs_ops.gpfsClose(VP_TO_PVP(iP), VP_TO_CNP(iP), FREAD|FWRITE,
                            (struct MMFSVInfo *)viP, true);

  cxiPutOSNode((void *)iP);

  return rc;
}

/* Return the NFS pointer recorded for inode vP, or NULL when vP is NULL
   or has no cxiNode attached. */
void * cxiGetNfsP(void *vP)
{
  if (!vP || !VP_TO_CNP((struct inode *)vP))
    return NULL;

  return VP_TO_NFSP((struct inode *)vP);
}

/* Record newP as the NFS pointer of inode vP.  Silently does nothing when
   vP is NULL or has no cxiNode attached, mirroring the checks made by
   cxiGetNfsP. */
void cxiSetNfsP(void *vP, void *newP)
{
  if (vP && VP_TO_CNP((struct inode *)vP))
    VP_TO_NFSP((struct inode *)vP) = newP;
}

/* Map an inode (vP) to its cxiNode via VP_TO_CNP. */
void * cxiGetCnP(void *vP)
{
  struct inode *inodeP = (struct inode *)vP;

  return (void *)VP_TO_CNP(inodeP);
}

/* Map an inode (vP) to its private vfs data via VP_TO_PVP. */
void * cxiGetPvP(void *vP)
{
  struct inode *inodeP = (struct inode *)vP;

  return (void *)VP_TO_PVP(inodeP);
}

/* Map a cxiNode (passed in as vP) back to its inode via GNP_TO_VP. */
void * cxiGNPtoVP(void *vP)
{
  struct cxiNode_t *cnP = (struct cxiNode_t *)vP;

  return (void *)GNP_TO_VP(cnP);
}

/* Set the command name (current->comm) of the calling task to nameP.
 * Names that do not fit are truncated; the result is always
 * NUL-terminated.  nameP is copied verbatim and is never interpreted as
 * a format string. */
void 
cxiSetProcessName(char *nameP)
{
  lock_kernel();

  /* Bounded copy that leaves room for the terminator.  strncpy pads the
     remainder with NULs when nameP is shorter than the buffer. */
  strncpy(current->comm, nameP, sizeof(current->comm) - 1);
  current->comm[sizeof(current->comm) - 1] = '\0';
  
  unlock_kernel();
}

/* Turn the calling task into a daemon: under the kernel lock, call
   exit_mm() and exit_files() on it and move it to session 1 and
   process group 1. */
void 
cxiSetProcessGroup()
{
  lock_kernel();

  exit_mm(current);
  exit_files(current);

  current->pgrp = 1;
  current->session = 1;

  unlock_kernel();
}

/* Task-queue routine run by keventd on behalf of cxiStartKProc: create
   the kernel thread described by argP (a cxiKProcData_t), publish the
   resulting pid (or KPROC_FAILED_PID) and signal the waiting starter. */
static void 
startKProc(void *argP)
{
  cxiKProcData_t *kpdP = (cxiKProcData_t *)argP;
  int karg = (kpdP->kargP) ? PTR_TO_INT32(kpdP->kargP) : 0;
  cxiPid_t pid = kernel_thread(kpdP->func, kpdP, karg);

  /* Publish the outcome under the lock the starter is waiting with */
  cxiBlockingMutexAcquire(&kpdP->lock);
  kpdP->pid = (pid > 0) ? pid : KPROC_FAILED_PID;
  cxiWaitEventSignal(&kpdP->startStopEvent);
  cxiBlockingMutexRelease(&kpdP->lock);
}

/* Start the kernel process described by kpdP and return its pid
 * (KPROC_FAILED_PID if startKProc could not create the thread).
 * The caller blocks until startKProc has filled in kpdP->pid. */
cxiPid_t 
cxiStartKProc(struct cxiKProcData_t *kpdP)
{
  struct tq_struct tq;

  DBGASSERT(kpdP->pid == KPROC_UNASSIGNED_PID);

  /* Have the new thread created by keventd.  We use
   * keventd to clean him up also.
   * NOTE(review): tq lives on this stack frame; this relies on keventd
   * not touching it once startKProc has been invoked -- confirm.
   */
  tq.sync = 0;
  tq.data = kpdP;
  tq.routine = startKProc;
  INIT_LIST_HEAD(&tq.list);

  schedule_task(&tq);

  /* Wait for startKProc to record the outcome */
  cxiBlockingMutexAcquire(&kpdP->lock);
  while (kpdP->pid == KPROC_UNASSIGNED_PID)
    cxiWaitEventWait(&kpdP->startStopEvent, &kpdP->lock, 0);
  cxiBlockingMutexRelease(&kpdP->lock);

  TRACE2(TRACE_VNODE, 1, TRCID_CXISTART_KPROC_LINUX,
         "cxiStartKProc %s pid %d \n", kpdP->nameP, kpdP->pid);
  return kpdP->pid;
}

/* Ask the kernel process described by kpdP to terminate and wait until
 * it has reset kpdP->pid to KPROC_UNASSIGNED_PID.  Returns immediately
 * when KPROC_RUNNING(kpdP) is false.
 * Note: the locals p and stillAlive (and pid) are only referenced by the
 * disabled (#if 0) code at the end of the routine. */
void
cxiStopKProc(struct cxiKProcData_t *kpdP)
{
  struct task_struct *p;
  cxiPid_t pid;
  Boolean stillAlive;

  cxiBlockingMutexAcquire(&kpdP->lock);
  
  TRACE2(TRACE_VNODE, 1, TRCID_CXISTOP_KPROC_LINUX,
         "cxiStopKProc: %s pid %d \n", kpdP->nameP, kpdP->pid);

  /* Nothing to stop */
  if (!KPROC_RUNNING(kpdP))
  {
    cxiBlockingMutexRelease(&kpdP->lock);
    return;
  }

  pid = kpdP->pid;

  /* Raise the terminate flag and poke the kproc's event so it notices */
  kpdP->terminate = true;
  cxiWaitEventSignal(&kpdP->kprocEvent);

  /* Wait (dropping the lock while asleep) for the pid to be unassigned */
  while (kpdP->pid != KPROC_UNASSIGNED_PID)
    cxiWaitEventWait(&kpdP->startStopEvent, &kpdP->lock, 0);

  cxiBlockingMutexRelease(&kpdP->lock);

  /* notify keventd to clean the process up
     (NOTE(review): pid 2 is presumably keventd -- confirm) */
  kill_proc(2, SIGCHLD, 1);

#if 0
  /* For some reason we reload the GPFS module (on a restart) 
   * only to find our kproc from the old module still isn't dead.
   * It will run again (perhaps as part of the final death throes)
   * and take a page fault on code that has been unloaded.  Thus
   * we're not leaving until it dies.
   */
  while(1)
  {
    read_lock(&tasklist_lock);
    p = find_task_by_pid(pid);

    if (p && strcmp(kpdP->nameP, p->comm) == 0)
    {
      read_unlock(&tasklist_lock);

      TRACE2(TRACE_VNODE, 1, TRCID_CXISTOP_KPROC_STILL_ALIVE,
             "cxiStopKProc: %s pid %d is still alive!\n", 
             kpdP->nameP, pid);
      cxiSleep(1000);
    }
    else
    {
      read_unlock(&tasklist_lock);
      break;
    }
  }
#endif
}

/*-------------------------------------------------------------------
 * logAssertFailed  - Subroutine consolidating logGenIF() and
 *                    DoPanic() calls.
 *------------------------------------------------------------------*/

static char PanicMsgBuf[2048];

/* Report a fatal condition: write panicStrP (prefixed with the calling
 * thread id) to the kernel log and to the trace, then invoke BUG() unless
 * the module was built with DISABLE_KERNEL_PANIC, in which case the
 * routine simply returns. */
void cxiPanic(const char* panicStrP)
{
  printk( GPFS_NOTICE  "kp %d: cxiPanic: %s\n", cxiGetThreadId(), panicStrP);
  TRACE1(TRACE_ERRLOG, 0, TRCID_PANIC, "cxiPanic: %s\n", panicStrP);
#ifndef DISABLE_KERNEL_PANIC
  BUG();
#endif
}

/* Build the message
 *     <file basename>:<line>:<retCode>:<reasonCode>:<dataStr>:<condP>
 * in the static PanicMsgBuf (dataStr and condP are omitted when NULL)
 * and hand it to cxiPanic. */
static void
DoPanic(char* condP, char* filenameP, int lineNum, Int32 retCode,
        Int32 reasonCode, char *dataStr)
{
  const char *baseNameP;
  int roomLeft;

  /* Strip any leading directory components from the file name */
  baseNameP = cxiStrrchr(filenameP, '/');
  if (baseNameP != NULL)
    baseNameP += 1;
  else
    baseNameP = filenameP;

  sprintf(PanicMsgBuf, "%s:%d:%d:%d:", baseNameP, lineNum, retCode, reasonCode);

  /* Each append is bounded by the space still free in the buffer,
     less one byte for the terminator. */
  if (dataStr)
  {
    roomLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
    strncat(PanicMsgBuf, dataStr, roomLeft-1);
  }

  roomLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
  strncat(PanicMsgBuf, ":", roomLeft-1);

  if (condP)
  {
    roomLeft = sizeof(PanicMsgBuf) - strlen(PanicMsgBuf);
    strncat(PanicMsgBuf, condP, roomLeft-1);
  }

  cxiPanic(PanicMsgBuf);
}

#ifdef MODULE
/* Handle a failed assertion inside the kernel module:
 *   1. report the failing expression and location via printk and trace;
 *   2. unless built with GPFS_PRINTF, sync the trace records (optionally
 *      clearing all trace flags under STOP_TRACE_ON_FAILURE) and sleep
 *      10 seconds so the trace daemon can finish;
 *   3. call gpfs_ops.gpfsDaemonToDie with the failure details;
 *   4. call DoPanic.
 * The flags and logRecTag arguments are not referenced by this routine.
 * The local i is only used when STOP_TRACE_ON_FAILURE is defined. */
void
logAssertFailed(UInt32 flags,   /* LOG_FATAL_ERROR or LOG_NONFATAL_ERROR */
                char  *srcFileName,   /* __FILE__ */
                UInt32 srcLineNumber, /* __LINE__ */
                Int32  retCode,       /* return code value */
                Int32  reasonCode,    /* normally errno */
                UInt32 logRecTag,     /* tag if have associated error log rec */
                char  *dataStr,       /* assert data string */
                char  *failingExpr)   /* expression that evaluated to false */
{
  int i;

  printk("GPFS logAssertFailed: %s file %s line %d\n",
         failingExpr, srcFileName, srcLineNumber);
  TRACE3(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_1,
         "logAssertFailed: %s retCode %d reasonCode %d\n",
         failingExpr, retCode, reasonCode);
  TRACE2(TRACE_ERRLOG, 0, TRCID_MODULE_LOGASSERT_2,
         "logAssertFailed: file %s line %d\n", srcFileName, srcLineNumber);
#ifndef GPFS_PRINTF
  /* fsync buffered lxtrace records */
  trc_fsync();

#ifdef STOP_TRACE_ON_FAILURE
  /* Turn off tracing right after the failure occurs.  This may only turn
     off tracing in the kernel. */
  for (i=0 ; i<MAX_TRACE_CLASSES ; i++)
    TraceFlagsP[i] = 0;
#endif

  /* Wait 10 seconds to allow the lxtrace daemon to complete the sync. */
  cxiSleep(10000);
#endif
  gpfs_ops.gpfsDaemonToDie(srcFileName, srcLineNumber, retCode, reasonCode,
                           dataStr, failingExpr);

  DoPanic(failingExpr, srcFileName, srcLineNumber, retCode, reasonCode,
          dataStr);
}
#else /* !MODULE */
/* Outside the module only the prototype is provided; the definition is
   expected to come from elsewhere. */
void
logAssertFailed(UInt32 flags,
                char  *srcFileName,
                UInt32 srcLineNumber,
                Int32  retCode,
                Int32  reasonCode,
                UInt32 logRecTag,
                char  *dataStr,
                char  *failingExpr);
#endif /* MODULE */


/* One sleeping thread on a cxiWaitEvent_t.  Instances are declared on the
 * sleeper's stack (see wait_on_lock and e_sleep_thread) and linked into
 * the event's waitList while the thread sleeps. */
typedef struct cxiWaitElement_t
{
  cxiWaitList_t waitList;  /* previous and next element in chain */

  /* Linux would normally organize a wait_queue_head_t with any number
   * of wait_queue_t elements.  However since we're implementing "wakeup
   * with return code" we have to ensure the OS wakes up the exact sleeper
   * we want.  Thus we have only a one to one relationship to ensure the
   * OS can only pick our favorite.
   */
  wait_queue_head_t qhead;   /* private queue head holding only qwaiter */
  wait_queue_t qwaiter;      /* queue entry for the sleeping task */
  int wakeupRC;            /* wakeup return code */

} cxiWaitElement_t;


/* Doubly linked circular list helpers for cxiWaitList_t style nodes
 * (anything with nextP/prevP members).  Both macros expand to a single
 * statement (do { } while (0)), so they are safe in unbraced if/else
 * bodies; invoke them with a trailing semicolon.  Arguments are evaluated
 * more than once, so they must be free of side effects.
 */

/* Append elementP at the tail of the list anchored at headP */
#define CXI_WAIT_LIST_ADD(headP, elementP) \
   do { \
     (headP)->prevP->nextP = (elementP); \
     (elementP)->prevP = (headP)->prevP; \
     (headP)->prevP = (elementP);        \
     (elementP)->nextP = (headP);        \
   } while (0)

/* Unlink elementP from whatever list it is on (its own links are left
   untouched) */
#define CXI_WAIT_LIST_REMOVE(elementP) \
   do { \
     (elementP)->prevP->nextP = (elementP)->nextP; \
     (elementP)->nextP->prevP = (elementP)->prevP; \
   } while (0)


/* Mark a Simple_lock as allocated by setting its count to -1.
   The opt, type and occurrence arguments are not used. */
void
lock_alloc(Simple_lock* lockP, int opt, int type, int occurrence)
{
  (void)opt;
  (void)type;
  (void)occurrence;

  lockP->slCount = -1;
}

/* Prepare an abstract wait event for use: make its wait list empty
 * (head linked to itself) and set the spinlock overlaid on lword to
 * the unlocked state.
 */
void
cxiWaitEventInit(cxiWaitEvent_t *weP)
{
  weP->waitList.prevP = &weP->waitList;
  weP->waitList.nextP = &weP->waitList;

  *((spinlock_t *)&weP->lword) = SPIN_LOCK_UNLOCKED;
}

/* True when at least one wait element is linked on the event's list,
   i.e. the list head does not point back at itself. */
Boolean
cxiWaitEventHasWaiters(cxiWaitEvent_t *weP)
{
  return (&weP->waitList != weP->waitList.nextP);
}

/* Put a Simple_lock into its initial state: not held, no owner,
   count zero and an empty wait event. */
void
simple_lock_init(Simple_lock* lockP)
{
  cxiWaitEventInit(&lockP->slEvent);
  lockP->slOwnerP = NULL;
  lockP->slState = 0;
  lockP->slCount = 0;
}

/* Retire a Simple_lock: clear its owner, state and count, then assert
   that nobody is still queued on its wait event. */
void
lock_free(Simple_lock* lockP)
{
  lockP->slOwnerP = NULL;
  lockP->slState = 0;
  lockP->slCount = 0;               /* assert that it is 0 ??? */

  LOGASSERT(!cxiWaitEventHasWaiters(&lockP->slEvent));
}

/* thread context only */
/* Sleep (uninterruptibly) until bit 0 of slockP->slState is observed
 * clear.  This routine does not take the lock itself; callers such as
 * simple_lock retry their test_and_set_bit after it returns.
 * slCount is incremented for the duration of the wait. */
void
wait_on_lock(Simple_lock* slockP)
{
  cxiWaitEvent_t *weP = &slockP->slEvent;
  spinlock_t *lockP = (spinlock_t *)(weP->lword);  /* protects waitList */
  unsigned long flags;
  cxiWaitElement_t waitElement;   /* lives on this stack while we sleep */

  DBGASSERT(!in_interrupt());

  ATOMIC_ADD(&slockP->slCount, 1);

  /* initialize our wait element */
  init_waitqueue_head(&waitElement.qhead);
  init_waitqueue_entry(&waitElement.qwaiter, current);
  __add_wait_queue(&waitElement.qhead, &waitElement.qwaiter);
  waitElement.wakeupRC = 0;

  /* add our wait element to the end of the wait list */
  wq_write_lock_irqsave(lockP, flags);
  CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList);
  wq_write_unlock_irqrestore(lockP, flags);

  /* The task state is set to sleeping BEFORE the lock bit is tested, so a
     wakeup arriving between the test and schedule() is not lost. */
repeat:
  current->state = TASK_UNINTERRUPTIBLE;
  if (test_bit(0, &slockP->slState))
  {
    schedule();
    goto repeat;
  }
  current->state = TASK_RUNNING;

  /* remove ourself from the wait list */
  wq_write_lock_irqsave(lockP, flags);
  CXI_WAIT_LIST_REMOVE(&waitElement.waitList);
  wq_write_unlock_irqrestore(lockP, flags);

  ATOMIC_ADD(&slockP->slCount, -1);
}

/* thread context only */
/* Acquire a Simple_lock, sleeping in wait_on_lock for as long as bit 0
 * of slState is found already set.  The owner field is set to the
 * address of this routine's own parameter, i.e. a location on the
 * acquiring thread's stack. */
void
simple_lock(Simple_lock* lockP)
{
  TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_001,
         "simple_lock: count %d state %d owner 0x%lX\n",
         lockP->slCount, lockP->slState, lockP->slOwnerP);

  for (;;)
  {
    /* Got it if the bit was previously clear */
    if (!test_and_set_bit(0, &lockP->slState))
      break;
    wait_on_lock(lockP);
  }
  lockP->slOwnerP = (char*)&lockP;

  TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_003,
         "simple_lock out: count %d state %d owner 0x%lX\n",
         lockP->slCount, lockP->slState, lockP->slOwnerP);
}

/* If you're serializing between thread/interrupt contexts,
 * both must use disable_lock/unlock_enable.  These routines
 * raise and lower the interrupt priority (ensuring serialization
 * on a single processor machine) and lock/unlock the lock word for MP
 * serialization.  Attempting to mix simple_lock (by process)
 * and disable_lock (by interrupt) on the same lock word would
 * not disable interrupts for the process.  Thus if the process
 * held the lock word and was interrupted on the same processor
 * and the interrupt context attempted a disable_lock, the
 * interrupt context would spin forever on the lock word.
 */
/* Disable local interrupts and acquire the lock word of lockP.
 * In process context the caller sleeps in wait_on_lock while the lock is
 * held by someone else; in interrupt context it spins.
 * The intr argument is not referenced.
 * Returns the saved interrupt flags cast to int, to be handed back to
 * unlock_enable. */
int
disable_lock(int intr, Simple_lock* lockP)
{
  unsigned long flags = 0;
  Boolean process = (in_interrupt() ? false : true);

  local_irq_save(flags);
  if (process)
  {
    /* NOTE(review): wait_on_lock may call schedule() while local
       interrupts are disabled here -- confirm this is intended. */
    while (test_and_set_bit(0, &lockP->slState))
      wait_on_lock(lockP);
  }
  else
  {
    /* We better be on an MP machine, otherwise we'll
     * block this processor spinning in interrupt state.
     */
    while (test_and_set_bit(0, &lockP->slState));
  }

  /* Owner is recorded as the address of this routine's own parameter */
  lockP->slOwnerP = (char*)&lockP;
  return (int)flags;
}

/* Do not add trace records.  Some callers depend on not being
 * interrupted by the trace daemon.
 */
/* Wake sleepers queued on wEventP, front of the list first.
 *   wakeAll   false: wake only the first element; otherwise wake all
 *   wakeupRC  value stored in each woken element's wakeupRC field
 * Elements are not unlinked here; each sleeper removes itself after it
 * resumes (see wait_on_lock and e_sleep_thread). */
static inline void
cxiWakeup(cxiWaitEvent_t *wEventP, Boolean wakeAll, int wakeupRC)
{
  unsigned long flags;
  spinlock_t *lockP = (spinlock_t *)(wEventP->lword);  /* protects waitList */
  cxiWaitList_t *headP;
  cxiWaitList_t *tmpP;
  cxiWaitElement_t *wP;

  wq_write_lock_irqsave(lockP, flags);

  /* We wake up from the front back (FIFO semantics).
   * There's only one wait element per wait_queue_head_t so
   * record the return code and wake up the one element.
   */
  headP = &wEventP->waitList;
  tmpP = headP->nextP;

  while (tmpP != headP)
  {
    wP = list_entry(tmpP, cxiWaitElement_t, waitList);
    wP->wakeupRC = wakeupRC;
    /* advance before waking the element */
    tmpP = tmpP->nextP;

    wake_up(&wP->qhead);
    if (wakeAll == false)
      break;
  }
  wq_write_unlock_irqrestore(lockP, flags);
}

/* thread context only */
/* Release a Simple_lock taken with simple_lock: clear the owner, clear
 * bit 0 of slState and wake the first waiter (if any) so it can retry.
 * Trace records are written on entry and exit. */
void
simple_unlock(Simple_lock* lockP)
{
  TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SUNLOCK,
         "simple_unlock: count %d state %d owner 0x%lX\n",
         lockP->slCount, lockP->slState, lockP->slOwnerP);

  /* if bit is not set something is wrong */
  DBGASSERT(test_bit(0, &lockP->slState));

  /* owner is cleared before the lock bit is released */
  lockP->slOwnerP = NULL;
  clear_bit(0, &lockP->slState);

  cxiWakeup(&lockP->slEvent, false, 0); /* wake up one */

  TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_SUNLOCK_EXIT,
         "simple_unlock out: count %d state %d owner 0x%lX\n",
         lockP->slCount, lockP->slState, lockP->slOwnerP);
}

/* Same release sequence as simple_unlock but without any trace calls;
 * used by e_sleep_thread, which must not trace while dropping the lock. */
static inline void
simple_unlock_nomsg(Simple_lock* lockP)
{
  /* if bit is not set something is wrong */
  DBGASSERT(test_bit(0, &lockP->slState));

  /* owner is cleared before the lock bit is released */
  lockP->slOwnerP = NULL;
  clear_bit(0, &lockP->slState);

  cxiWakeup(&lockP->slEvent, false, 0); /* wake up one */
}

/* thread or interrupt context unlock and enable interrupts */
/* Counterpart of disable_lock: release the lock word, restore the
 * interrupt state from intr (the value disable_lock returned) and then
 * wake the first waiter, if any. */
void
unlock_enable(int intr, Simple_lock* lockP)
{
  /* if bit is not set something is wrong */
  DBGASSERT(test_bit(0, &lockP->slState));
  DBGASSERT(lockP->slOwnerP != NULL);

  lockP->slOwnerP = NULL;
  clear_bit(0, &lockP->slState);

  /* interrupts are restored before the waiter is woken */
  local_irq_restore(intr);

  cxiWakeup(&lockP->slEvent, false, 0); /* wake up one */
}


/* Sleep on wait event weP, releasing Simple_lock sl while asleep and
 * re-acquiring it before returning.
 *   waitFlags  INTERRUPTIBLE selects an interruptible sleep; otherwise
 *              the sleep is uninterruptible
 * Returns the wakeup code stored by cxiWakeup, or THREAD_INTERRUPTED when
 * the thread resumed with a wakeup code of zero. */
int
e_sleep_thread(cxiWaitEvent_t* weP, Simple_lock *sl, int waitFlags)
{
  spinlock_t *lockP = (spinlock_t *)(weP->lword);  /* protects waitList */
  unsigned long flags;
  cxiWaitElement_t waitElement;   /* lives on this stack while we sleep */
  int count = 0;                  /* kswapd re-sleep attempts so far */
  Boolean done;

  TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SLEEP_THREAD_ENTER,
         "e_sleep_thread enter: weP 0x%lX sl 0x%lX waitFlags 0x%X\n",
         weP, sl, waitFlags);

  /* initialize our wait element */
  init_waitqueue_head(&waitElement.qhead);
  init_waitqueue_entry(&waitElement.qwaiter, current);
  __add_wait_queue(&waitElement.qhead, &waitElement.qwaiter);
  waitElement.wakeupRC = 0;

  /* update our task state to not running any more */
  if (waitFlags & INTERRUPTIBLE)
    current->state = TASK_INTERRUPTIBLE;
  else
    current->state = TASK_UNINTERRUPTIBLE;

  /* add our wait element to the end of the wait list */
  wq_write_lock_irqsave(lockP, flags);
  CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList);
  wq_write_unlock_irqrestore(lockP, flags);

  /* Release the lock.  Note: we need to use the unlock routine that does
     not make trace calls.  See comment in cxiWaitEventWait for details. */
  simple_unlock_nomsg(sl);

again:
  /* call the scheduler */
  schedule();

  /* Remove ourself from the wait list.
     See comment in cxiWaitEventWait for an explanation of the special
     case of kswapd being interrupted in an uninterruptible wait. */
  wq_write_lock_irqsave(lockP, flags);
  if (waitElement.wakeupRC == 0 &&
      !(waitFlags & INTERRUPTIBLE) &&
      cxiIsKswapdThread() &&
      count++ < 50)
  {
    /* No wakeup code was posted: stay queued and go back to sleep
       (at most 50 times). */
    current->state = TASK_UNINTERRUPTIBLE;
    done = false;
  }
  else
  {
    CXI_WAIT_LIST_REMOVE(&waitElement.waitList);
    done = true;
  }
  wq_write_unlock_irqrestore(lockP, flags);

  if (!done)
    goto again;

  /* re-acquire the lock */
  simple_lock(sl);

  TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_SLEEP_THREAD_EXIT,
         "e_sleep_thread exit: weP 0x%lX sl 0x%lX rc %d\n",
         weP, sl, waitElement.wakeupRC);

  /* A zero wakeup code means we were interrupted rather than woken up */
  if (waitElement.wakeupRC != 0)
    return waitElement.wakeupRC;
  else
    return THREAD_INTERRUPTED;
}

/* Wake every thread sleeping on weP, handing each THREAD_AWAKENED as
   its wakeup code. */
void
cxiWaitEventWakeup(cxiWaitEvent_t* weP)
{
  const int wakeupRC = THREAD_AWAKENED;

  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP,
         "cxiWaitEventWakeup: weP 0x%lX\n", weP);

  /* wake up all */
  cxiWakeup(weP, true, wakeupRC);
}

/* Wake only the first thread sleeping on weP, handing it
   THREAD_AWAKENED as its wakeup code. */
void
cxiWaitEventWakeupOne(cxiWaitEvent_t* weP)
{
  const int wakeupRC = THREAD_AWAKENED;

  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP_ONE,
         "cxiWaitEventWakeupOne: weP 0x%lX\n", weP);

  /* wake up one */
  cxiWakeup(weP, false, wakeupRC);
}

/* Wake every thread sleeping on weP and hand each of them rc as the
   result of its sleep. */
void
cxiWaitEventWakeupResult(cxiWaitEvent_t* weP, int rc)
{
  const int wakeupRC = rc;

  TRACE2(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_WAKEUP_RESULT,
         "cxiWaitEventWakeupResult: weP 0x%lX rc %d\n", weP, rc);

  /* wake up all */
  cxiWakeup(weP, true, wakeupRC);
}


/* Copy size bytes from user address 'from' to kernel address 'to'.
   The result of __copy_from_user is returned unchanged. */
int
cxiCopyIn(char *from, char *to, unsigned long size)
{
  int rc = __copy_from_user(to, from, size);

  return rc;
}


/* Copy size bytes from kernel address 'from' to user address 'to'.
   The result of __copy_to_user is returned unchanged. */
int
cxiCopyOut(char *from, char *to, unsigned long size)
{
  int rc = __copy_to_user(to, from, size);

  return rc;
}

/* Copy a string from user space ('from') into the kernel buffer 'to' of
 * 'size' bytes using strncpy_from_user.
 * On success returns 0 and stores the value strncpy_from_user returned in
 * *len.  Otherwise *len is set to 0 and the result is EFAULT (negative
 * return from strncpy_from_user) or ENAMETOOLONG (any other case,
 * which includes a return of zero).
 * NOTE(review): a zero return -- presumably an empty string -- is
 * reported as ENAMETOOLONG; confirm callers expect that. */
int
cxiCopyInstr(char *from, char *to, unsigned long size, unsigned long *len)
{
  long copied = strncpy_from_user(to, from, size);

  if (copied < 0)
  {
    *len = 0;
    return EFAULT;
  }

  if (copied == 0 || copied > size)
  {
    *len = 0;
    return ENAMETOOLONG;
  }

  *len = copied;
  return 0;
}

/* Stub: always reports failure (-1). */
int socket_aio_dequeue()
{
  const int rc = -1;

  return rc;
}

/* Transfer data from buffer(s) in user space to or from a buffer in the
   kernel. */
/* Up to nBytes are moved between kBufP and the iovec(s) described by
 * uioP; the iovec base/length and uio_resid/uio_offset are advanced by
 * the amount processed.
 * Returns ENOMEM when uio_resid is not positive on entry or the iovecs
 * run out in the multi-iovec loop; otherwise returns the value produced
 * by the last __copy_from_user/__copy_to_user call (0 when nothing was
 * copied).
 * The single-iovec fast path differs from the general loop: it performs
 * at most one copy and returns 0 immediately when the iovec is empty. */
int
cxiUiomove(register char* kBufP,          /* address of kernel buffer */
           register unsigned long nBytes, /* #bytes to transfer */
           Boolean toKernel,              /* direction of xfer(read/write)*/
           register struct cxiUio_t* uioP) /* user area description */
{
  register struct cxiIovec_t * iovP;
  unsigned long cnt;
  int rc;
#ifdef TRACE_IO_DATA
  char* origKBufP = kBufP;
  int trcdata[4];
#endif

  TRACE4(TRACE_FOPS, 2, TRCID_CXISYSTEM_037,
         "cxiUiomove enter: kBufP 0x%lX uioP 0x%lX nBytes %d toKernel %d\n",
         kBufP, uioP, nBytes, toKernel);
  if (uioP->uio_resid <= 0)
    return ENOMEM;
  rc = 0;
  if (uioP->uio_iovcnt == 1)
  {
    /*
     * Fastpath for most common case of iovcnt == 1.  Saves a
     * few instructions.
     */
    iovP = uioP->uio_iov;
    cnt = iovP->iov_len;
    /* cnt is unsigned, so this only catches an empty iovec */
    if (cnt <= 0)
    {
      uioP->uio_iovcnt--;
      uioP->uio_iov++;
      uioP->uio_iovdcnt++;
      return 0;
    }
    if (cnt > nBytes)
      cnt = nBytes;

    if (toKernel)
      /* inline for efficiency
         rc = cxiCopyIn((char *)iovP->iov_base, kBufP, cnt); */
      rc = __copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
    else
      /* inline for efficiency
         rc = cxiCopyOut(kBufP, (char *)iovP->iov_base, cnt); */
      rc = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
    /* bookkeeping is advanced by cnt regardless of rc */
    iovP->iov_base = (char *)iovP->iov_base + cnt;
    iovP->iov_len -= cnt;
    uioP->uio_resid -= cnt;
    uioP->uio_offset += cnt;
#ifdef TRACE_IO_DATA
    if (cnt >= sizeof(trcdata))
      memcpy(trcdata, origKBufP, sizeof(trcdata));
    else
    {
      memset(trcdata, 0xAA, sizeof(trcdata));
      memcpy(trcdata, origKBufP, cnt);
    }
    TRACE5(TRACE_FOPS, 3, TRCID_CXISYSTEM_039a,
           "uiomove exit 1: rc %d data %08X %08X %08X %08X\n",
           rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
#else
    TRACE1(TRACE_FOPS, 3, TRCID_CXISYSTEM_039,
           "uiomove exit 1: rc %d\n",
           rc);
#endif
    return rc;
  }
  /* General case: walk the iovecs until nBytes are moved, the residual
     count reaches zero, or a copy reports a non-zero result. */
  while (nBytes > 0 && uioP->uio_resid && rc == 0)
  {
    if (uioP->uio_iovcnt <= 0)
      return ENOMEM;
    iovP = uioP->uio_iov;
    cnt = iovP->iov_len;
    /* skip over an exhausted iovec */
    if (cnt <= 0)
    {
      uioP->uio_iovcnt--;
      uioP->uio_iov++;
      uioP->uio_iovdcnt++;
      continue;
    }
    if (cnt > nBytes)
      cnt = nBytes;

    if (toKernel)
      /* inline for efficiency
         rc = cxiCopyIn((char *)iovP->iov_base, kBufP, cnt); */
      rc = __copy_from_user(kBufP, (char *)iovP->iov_base, cnt);
    else
      /* inline for efficiency
         rc = cxiCopyOut(kBufP, (char *)iovP->iov_base, cnt); */
      rc = __copy_to_user((char *)iovP->iov_base, kBufP, cnt);
    iovP->iov_base = (char *)iovP->iov_base + cnt;
    iovP->iov_len -= cnt;
    uioP->uio_resid -= cnt;
    uioP->uio_offset += cnt;
    kBufP += cnt;
    nBytes -= cnt;
  }
#ifdef TRACE_IO_DATA
  cnt = kBufP - origKBufP;
  if (cnt >= sizeof(trcdata))
    memcpy(trcdata, origKBufP, sizeof(trcdata));
  else
  {
    memset(trcdata, 0xAA, sizeof(trcdata));
    memcpy(trcdata, origKBufP, cnt);
  }
  TRACE5(TRACE_FOPS, 3, TRCID_CXISYSTEM_041a,
         "uiomove exit 2: rc %d data %08X %08X %08X %08X\n",
         rc, trcdata[0], trcdata[1], trcdata[2], trcdata[3]);
#else
  TRACE1(TRACE_FOPS, 3, TRCID_CXISYSTEM_041,
         "uiomove exit 2: rc %d\n",
         rc);
#endif
  return rc;
}


/* Verify that the type definitions in our cxiTypes.h files are
 * compatible with the system's.  Returns 0 when everything matches,
 * otherwise the number of the first mismatching check; the module
 * should refuse to load on a non-zero result.
 */
int
cxiCheckTypes()
{
#define OFFSET_OF(_field, _struct) ( (long) ( &((_struct *)0)->_field ) )
  int rc;

  /* Check 1: cxiBlockingMutex_t must have room for a struct semaphore.
     If not, the implementation of the cxiBlockingMutex... routines needs
     to use the embedded space to record a pointer to kmalloc'ed space
     holding the semaphore. */
  rc = (sizeof(struct semaphore) > GPFS_LINUX_SEM_SIZE) ? 1 : 0;

  if (rc > 0)
  {
    TRACE1(TRACE_TASKING, 2, TRCID_CXISYSTEM_CHKTYPES,
           "cxiCheckTypes: system type mismatch on type number %d!\n", rc);
  }
  return rc;
}

/* Store the current time of day in *tsP as seconds plus nanoseconds.
 * Always returns 0.
 */
int
cxiGetTOD(cxiTimeStruc_t *tsP)
{
  struct timeval now;

  do_gettimeofday(&now);

  /* The kernel hands back microseconds; scale them up to nanoseconds */
  tsP->tv_nsec = now.tv_usec * 1000;
  tsP->tv_sec = now.tv_sec;

  return 0;
}

/* True when the calling task's command name is exactly "nfsd". */
Boolean
cxiIsNFSThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
    /* The first four characters are compared as one integer against a
       multibyte character constant (not a string constant); the
       characters appear reversed because of the little-endian
       representation of integers.  The fifth byte must be the
       terminator. */
    if (* ((int*)&current->comm[0]) == 0x6473666e) // 'dsfn'
      return * ((char*)&current->comm[4]) == '\0';
    return false;
# else
    return (strcmp(current->comm, "nfsd") == 0);
# endif
}

/* True when the calling task is the kernel's periodic flush daemon,
   whose command name depends on the kernel version. */
Boolean
cxiIsKupdateThread()
{
  const char *daemonNameP;

#if LINUX_KERNEL_VERSION >= 2040200
  daemonNameP = "kupdated";
#else
  daemonNameP = "kupdate";
#endif

  return (strcmp(current->comm, daemonNameP) == 0);
}

#ifdef SMB_LOCKS
/* True when the calling task's command name is "smbd" or "lockd". */
Boolean
cxiIsSambaOrLockdThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
    /* Words of the name are compared as integers against multibyte
       character constants (not string constants); the characters appear
       reversed because of the little-endian representation of integers. */
    int firstWord = * ((int*)&current->comm[0]);
    Boolean isSmbd  = ((firstWord == 0x64626d73) &                       // 'dbms'
                       (* ((char*)&current->comm[4]) == '\0'));
    Boolean isLockd = ((firstWord == 0x6b636f6c) &                       // 'kcol'
                       (* ((int*)&current->comm[2]) == 0x00646b63));     // 'dkc'

    return (isSmbd | isLockd);
# else
    int isSmbd  = (strcmp(current->comm, "smbd") == 0);
    int isLockd = (strcmp(current->comm, "lockd") == 0);

    return (isSmbd | isLockd);
# endif
}
#endif

#ifdef CCL
/* True when the calling task's command name is exactly "smbd". */
Boolean
cxiIsSambaThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
    /* The first four characters are compared as one integer against a
       multibyte character constant (not a string constant); the
       characters appear reversed because of the little-endian
       representation of integers. */
    int firstWord = * ((int*)&current->comm[0]);
    Boolean rc = ((firstWord == 0x64626d73) &                    // 'dbms'
                  (* ((char*)&current->comm[4]) == '\0'));

    return rc;
# else
    return (strcmp(current->comm, "smbd") == 0);
# endif
}
#endif

/* True when the calling task's command name is exactly "mmfsd". */
Boolean
cxiIsGPFSThread()
{
# if defined(GPFS_LITTLE_ENDIAN) && !defined(__64BIT__)
  /* Two overlapping integer loads (offsets 0 and 2) cover the five
     characters plus the terminator. */
  int headMatches = (* ((int*)&current->comm[0]) == 0x73666d6d);  // 'sfmm'
  int tailMatches = (* ((int*)&current->comm[2]) == 0x00647366);  // 'dsf'

  return (headMatches & tailMatches);
# else
  return (strcmp(current->comm, "mmfsd") == 0);
# endif
}

/* Set up a cxiBlockingMutex_t: no owner, lock name index bmNameIdx, and
   the embedded semaphore initialized as a mutex.  Instead of the
   DBGASSERT, this routine should kmalloc a struct semaphore if bmSem is
   too small.  */
void cxiBlockingMutexInit(cxiBlockingMutex_t* mP, int bmNameIdx)
{
  DBGASSERT(sizeof(struct semaphore) <= GPFS_LINUX_SEM_SIZE);
#ifdef INSTRUMENT_LOCKS
  DBGASSERT(bmNameIdx < MAX_GPFS_LOCK_NAMES);
#endif  /* INSTRUMENT_LOCKS */

  TRACE2(TRACE_KLOCKL, 3, TRCID_BM_INIT,
         "cxiBlockingMutexInit: mP 0x%lX idx %d\n",
         mP, bmNameIdx);

  mP->lockNameIndex = bmNameIdx;
  mP->bmOwnerP = NULL;
  init_MUTEX((struct semaphore *)mP->bmSem);
}


/* Enter the critical section guarded by *mP, blocking this thread until
   the mutex is available, and record this thread as the owner before
   returning.
   The owner is recorded as the address of a variable on the acquiring
   thread's stack.  Rounding such an address down to a THREAD_SIZE boundary
   gives the value this code compares against current to identify the
   owning thread. */
void cxiBlockingMutexAcquire(cxiBlockingMutex_t* mP)
{
  char stackMarker;   /* only its address is used, as the owner tag */
  struct semaphore *semP = (struct semaphore *)mP->bmSem;
  UIntPtr stackMask = ~((UIntPtr)(THREAD_SIZE-1));
  UIntPtr currentOwner = ((UIntPtr)mP->bmOwnerP) & stackMask;

  TRACE4(TRACE_KLOCKL, 9, TRCID_BM_ACQ,
         "cxiBlockingMutexAcquire: about to acquire 0x%lX type %d "
         "current 0x%lX currentOwner 0x%lX\n",
         mP, mP->lockNameIndex, current, currentOwner);

  /* Acquiring a mutex this thread already owns would never return */
  DBGASSERTRC(currentOwner != (UIntPtr)current, currentOwner,
              PTR_TO_INT32(mP), 0);

#ifdef INSTRUMENT_LOCKS
  /* Count every acquire, and count a conflict when an owner is recorded
     at the time of the attempt */
  BlockingMutexStatsTable[mP->lockNameIndex].bmsAcquires++;
  if (mP->bmOwnerP != NULL)
    BlockingMutexStatsTable[mP->lockNameIndex].bmsConflicts++;
#endif

  down(semP);
  mP->bmOwnerP = &stackMarker;

  TRACE1(TRACE_KLOCKL, 9, TRCID_BM_ACQ_EXIT,
         "cxiBlockingMutexAcquire: returning after acquiring 0x%lX\n", mP);
}


/* Leave the critical section guarded by *mP and let a waiting thread, if
   any, proceed.  The caller must be the recorded owner; this is checked
   with DBGASSERTRC. */
void cxiBlockingMutexRelease(cxiBlockingMutex_t* mP)
{
  struct semaphore *semP = (struct semaphore *)mP->bmSem;
  UIntPtr stackMask = ~((UIntPtr)(THREAD_SIZE-1));
  UIntPtr currentOwner = ((UIntPtr)mP->bmOwnerP) & stackMask;

  TRACE2(TRACE_KLOCKL, 9, TRCID_BM_REL,
         "cxiBlockingMutexRelease: about to release 0x%lX type %d\n",
         mP, mP->lockNameIndex);
  DBGASSERTRC(currentOwner == (UIntPtr)current, currentOwner,
              PTR_TO_INT32(mP), 0);

  /* Clear the owner while the semaphore is still held, then release it */
  mP->bmOwnerP = NULL;
  up(semP);
}


/* Free resources associated with this cxiBlockingMutex_t in preparation
   for freeing the storage it occupies.  The semaphore lives inside the
   mutex itself (mP->bmSem), so this routine releases nothing; it only
   traces the call and asserts that the mutex is idle. */
void cxiBlockingMutexTerm(cxiBlockingMutex_t* mP)
{
  TRACE2(TRACE_KLOCKL, 3, TRCID_BM_TERM,
         "cxiBlockingMutexTerm: mP 0x%lX type %d\n", mP, mP->lockNameIndex);

  /* Verify that mutex is not held: no owner is recorded and the semaphore
     count is 1 (presumably the value init_MUTEX sets -- confirm) */
  DBGASSERT(mP->bmOwnerP == NULL);
  DBGASSERT(atomic_read(&((struct semaphore *)mP->bmSem)->count) == 1);
}


/* Return true if a cxiBlockingMutex_t is held by the calling process.
   The recorded owner is a stack address; cxiStackAddrToThreadId converts
   it to a thread id, which is compared with current->pid. */
Boolean cxiBlockingMutexHeldByCaller(cxiBlockingMutex_t* mP)
{
  cxiPid_t ownerPid;
  Boolean heldByCaller = false;
  /* Cache bmOwnerP in case it changes to NULL after it has been tested */
  char* ownerP = mP->bmOwnerP;

  if (ownerP != NULL)
  {
    cxiStackAddrToThreadId(ownerP, &ownerPid);
    heldByCaller = (current->pid == ownerPid);
  }

  TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_017,
         "cxiBlockingMutexHeldByCaller: owner 0x%lX returns %d\n",
         ownerP, heldByCaller);
  return heldByCaller;
}


/* Return true if a cxiBlockingMutex_t has one or more processes waiting
   on it, false otherwise.  The answer comes from checking whether the
   wait queue of the underlying semaphore is non-empty.
   NOTE(review): no lock is taken around the check here, so the result may
   be stale by the time the caller uses it -- confirm callers treat it as a
   hint. */
Boolean cxiBlockingMutexHasWaiters(cxiBlockingMutex_t* mP)
{
  struct semaphore * semP = (struct semaphore *)mP->bmSem;
  Boolean result;

  /* An empty list head points back at itself, so the queue has waiters
     exactly when next refers to something other than the head */
  if ((void*)semP->wait.task_list.next != (void*)&semP->wait.task_list)
    result = true;
  else
    result = false;
  TRACE2(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_018,
         "cxiBlockingMutexHasWaiters: mP 0x%lX hasWaiters %d\n",
         mP, result);
  return result;
}


/* Wait for a cxiWaitEventSignal, cxiWaitEventBroadcast, or
   cxiWaitEventBroadcastRC.  Drop the associated cxiBlockingMutex_t
   *mutexP while waiting, and reacquire it before returning.
   If INTERRUPTIBLE is set in waitFlags, waits interruptibly;
   otherwise, waits uninterruptibly.
     The caller must hold *mutexP on entry (checked with DBGASSERTRC).
     Returns THREAD_INTERRUPTED if interrupted before being woken up
   (that is, if the wait element's wakeupRC is still zero when the wait
   ends), THREAD_AWAKENED, if woken up by cxiWaitEventSignal or
   cxiWaitEventBroadcast, or the result value passed to
   cxiWaitEventWakeupResult, if woken up by cxiWaitEventWakeupResult.
   NOTE(review): cxiWaitEventWakeupResult is not defined in the visible
   part of this file; the routine that passes a caller-chosen rc to
   cxiWakeup here is cxiWaitEventBroadcastRC -- confirm which name is
   current. */
int cxiWaitEventWait(cxiWaitEvent_t* weP, cxiBlockingMutex_t* mutexP,
                     int waitFlags)
{
  spinlock_t *lockP = (spinlock_t *)(weP->lword);  /* guards weP->waitList */
  unsigned long flags;
  cxiWaitElement_t waitElement;  /* lives on this stack for the whole wait */
  UIntPtr currentOwner;
  int count = 0;                 /* premature kswapd wakeups seen so far */
  Boolean done;

  TRACE3(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_EVENT_WAIT_ENTER,
         "cxiWaitEventWait enter: weP 0x%lX waitFlags 0x%X about to release "
         "mutex 0x%lX \n", weP, waitFlags, mutexP);

  /* Verify that caller is holding the mutex */
  currentOwner = ((UIntPtr)mutexP->bmOwnerP) & ~(THREAD_SIZE-1);
  DBGASSERTRC(currentOwner == (UIntPtr)current, currentOwner, 
              PTR_TO_INT32(mutexP), 0);

  /* initialize our wait element: a private wait queue holding only the
     current task, and a wakeup code of zero meaning "not woken yet" */
  init_waitqueue_head(&waitElement.qhead);
  init_waitqueue_entry(&waitElement.qwaiter, current);
  __add_wait_queue(&waitElement.qhead, &waitElement.qwaiter);
  waitElement.wakeupRC = 0;

  /* update our task state to not running any more.  This is done before
     the element is queued and before the mutex is dropped; see the comment
     at the mutex release below for why the order matters. */
  if (waitFlags & INTERRUPTIBLE)
    current->state = TASK_INTERRUPTIBLE;
  else
    current->state = TASK_UNINTERRUPTIBLE;

  /* add our wait element to the end of the wait list */
  wq_write_lock_irqsave(lockP, flags);
  CXI_WAIT_LIST_ADD(&weP->waitList, &waitElement.waitList);
  wq_write_unlock_irqrestore(lockP, flags);

  /* Release the mutex.  Note: calling cxiBlockingMutexRelease here is
     problematic, because it makes trace calls, which may block the current
     process, which would overwrite the task state (current->state) we just
     updated.  A way around this would be to move our task state update to
     after the call to cxiBlockingMutexRelease, but then, before calling
     schedule(), we would have to re-acquire the wait-list lock and check
     wakeupRC to see whether somebody has already woken us up since we
     released the mutex.  Since there is a trace at the top of this routine,
     we don't need the one in cxiBlockingMutexRelease; hence, just do the
     release right here. */
  mutexP->bmOwnerP = NULL;
  up((struct semaphore *)mutexP->bmSem);

again:
  /* call the scheduler */
  schedule();

  /* Remove ourself from the wait list ... except:
     It appears that kswapd is special, and even though it waits
     uninterruptibly, the page_alloc code may call wakeup_kswapd().
     This causes a unexpected premature wake up in this code.  Perhaps we
     should hand off kswapd's work to another thread so that it won't ever
     block in GPFS???  It could possibly make better forward progress
     elsewhere in the kernel.  The count of 50 is an arbitrary number to allow
     us out and assert.  Hopefully the page allocation won't ever get in that
     much trouble.
     The conditions below short-circuit, so count is only advanced for an
     uninterruptible kswapd wait that has not yet been given a wakeup code. */
  wq_write_lock_irqsave(lockP, flags);
  if (waitElement.wakeupRC == 0 &&
      !(waitFlags & INTERRUPTIBLE) &&
      cxiIsKswapdThread() &&
      count++ < 50)
  {
    /* Spurious wakeup: stay on the wait list and go back to sleep */
    current->state = TASK_UNINTERRUPTIBLE;
    done = false;
  }
  else
  {
    CXI_WAIT_LIST_REMOVE(&waitElement.waitList);
    done = true;
  }
  wq_write_unlock_irqrestore(lockP, flags);

  if (!done)
    goto again;

  /* re-acquire the mutex */
  cxiBlockingMutexAcquire(mutexP);

  TRACE3(TRACE_KLOCKL, 9, TRCID_CXISYSTEM_EVENT_WAIT_EXIT,
         "cxiWaitEventWait exit: weP 0x%lX mutexP 0x%lX rc %d\n",
         weP, mutexP, waitElement.wakeupRC);

  /* A zero wakeup code means we were interrupted rather than woken up */
  if (waitElement.wakeupRC != 0)
    return waitElement.wakeupRC;
  else
    return THREAD_INTERRUPTED;
}

/* Wake up one thread waiting on this cxiWaitEvent_t.  The call is traced
   and then handed to cxiWakeup with THREAD_AWAKENED as the wakeup code. */
void
cxiWaitEventSignal(cxiWaitEvent_t* weP)
{
  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_SIGNAL,
         "cxiWaitEventSignal: weP 0x%lX\n", weP);

  /* second argument false: wake a single waiter rather than all of them */
  cxiWakeup(weP, false, THREAD_AWAKENED); /* wake up one */
}


/* Wake up all threads waiting on this cxiWaitEvent_t.  The call is traced
   and then handed to cxiWakeup with THREAD_AWAKENED as the wakeup code. */
void
cxiWaitEventBroadcast(cxiWaitEvent_t* weP)
{
  /* The trace text names this routine, so its entries can be told apart
     from those written by cxiWaitEventBroadcastRC */
  TRACE1(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST,
         "cxiWaitEventBroadcast: weP 0x%lX\n", weP);

  cxiWakeup(weP, true, THREAD_AWAKENED); /* wake up all */
}


/* Wake up all threads waiting on this cxiWaitEvent_t and cause them to
   return rc from their cxiWaitEventWait calls.
   NOTE(review): cxiWaitEventWait treats a wakeup code of zero as "not
   woken" and returns THREAD_INTERRUPTED for it, so rc is presumably
   expected to be non-zero -- confirm against cxiWakeup. */
void
cxiWaitEventBroadcastRC(cxiWaitEvent_t* weP, int rc)
{
  TRACE2(TRACE_KLOCKL, 3, TRCID_CXISYSTEM_BROADCAST_RC,
         "cxiWaitEventBroadcastRC: weP 0x%lX rc %d\n", weP, rc);

  cxiWakeup(weP, true, rc);  /* wake up all */
}

/* Allocate a big memory area of size bytes with vmalloc.  Returns whatever
   vmalloc returns; the area is released with cxiBigFree. */
void *
cxiBigMalloc(int size)
{
  void *areaP;

  areaP = vmalloc(size);
  return areaP;
}

/* Free a big memory area by passing ptr to vfree.
   NOTE(review): ptr is presumably an address obtained from cxiBigMalloc
   (which allocates with vmalloc) -- confirm against callers. */
void
cxiBigFree(char *ptr)
{
  vfree(ptr);
}

#ifdef SMB_LOCKS
/* Exchange control between Samba, GPFS: hand an open-lock control request
   for an open file to gpfs_ops.SMBOpenLockControl.
     fP       - open file the request applies to
     command  - request code, passed through unchanged
     lockmode - lock mode, passed through unchanged
   Returns 0 without calling into GPFS when the file's inode is not on a
   file system whose type name is "gpfs"; otherwise returns the value of
   gpfs_ops.SMBOpenLockControl. */
int
cxiSMBOpenLockControl(struct file *fP, int command, int lockmode)
{
  struct inode *iP;
  struct gpfsVfsData_t *privVfsP;
  struct MMFSVInfo *vinfoP;
  int rc;

/* check if inode belongs to GPFS
   NOTE(review): the macro body refers to the local variable iP directly
   and ignores its IP parameter, so it only works where a variable named
   iP is in scope. */
#define GPFS_TYPE(IP) (!(iP) ? false : \
                        (!(iP->i_sb) ? false : \
                          (!(iP->i_sb->s_type) ? false : \
                            !strcmp(iP->i_sb->s_type->name, "gpfs"))))

  TRACE2(TRACE_VNODE, 9, TRCID_SMBOPENLOCKCONTROL_1,
         "cxiSMBOpenLockControl entry - command %d, mode 0x%x",
         command, lockmode);

  iP = fP->f_dentry->d_inode;
  vinfoP = (struct MMFSVInfo *)fP->private_data;
  DBGASSERT(iP != NULL);

  if (GPFS_TYPE(iP))
  {
    privVfsP = VP_TO_PVP(iP);
    rc = gpfs_ops.SMBOpenLockControl(command, lockmode, iP->i_ino, vinfoP,
                                     privVfsP);
  }
  else
  {
    /* Not a GPFS file: nothing to do, report success */
    rc = 0;
  }

  TRACE1(TRACE_VNODE, 1, TRCID_SMBOPENLOCKCONTROL_EXIT,
         "cxiSMBOpLockControl exit: rc %d\n", rc);

  return (rc);
}


/* Exchange control between Samba, GPFS: hand an open-lock move request for
   an open file to gpfs_ops.SMBOpenLockMove.
     fP         - open file the request applies to
     command    - request code, passed through unchanged
     lockmode   - lock mode, passed through unchanged
     nodenumber - origin node (as named in the entry trace), passed through
                  unchanged
   Returns 0 without calling into GPFS when the file's inode is not on a
   file system whose type name is "gpfs"; otherwise returns the value of
   gpfs_ops.SMBOpenLockMove. */
int
cxiSMBOpenLockMove(struct file *fP, int command, int lockmode, int nodenumber)
{
  struct inode *iP;
  struct gpfsVfsData_t *privVfsP;
  struct MMFSVInfo *vinfoP;
  int rc;

/* check if inode belongs to GPFS
   NOTE(review): the macro body refers to the local variable iP directly
   and ignores its IP parameter, so it only works where a variable named
   iP is in scope. */
#define GPFS_TYPE(IP) (!(iP) ? false : \
                        (!(iP->i_sb) ? false : \
                          (!(iP->i_sb->s_type) ? false : \
                            !strcmp(iP->i_sb->s_type->name, "gpfs"))))

  TRACE3(TRACE_VNODE, 9, TRCID_SMBOPENLOCKMOVE_1,
         "cxiSMBOpenLockMove entry - command %d, mode 0x%x, origin node %d",
         command, lockmode, nodenumber);

  iP = fP->f_dentry->d_inode;
  vinfoP = (struct MMFSVInfo *)fP->private_data;
  DBGASSERT(iP != NULL);

  if (GPFS_TYPE(iP))
  {
    privVfsP = VP_TO_PVP(iP);
    rc = gpfs_ops.SMBOpenLockMove(command, lockmode, nodenumber, iP->i_ino,
                                  vinfoP, privVfsP);
  }
  else
  {
    /* Not a GPFS file: nothing to do, report success */
    rc = 0;
  }

  TRACE1(TRACE_VNODE, 1, TRCID_SMBOPENLOCKMOVE_EXIT,
         "cxiSMBOpLockMove exit: rc %d\n", rc);

  return (rc);
}

/* Determine if current process has this file open.
   Scans every slot of the calling process's file descriptor table for an
   open file whose dentry refers to the inode belonging to cnP.
   Returns the private_data pointer of the first matching open file, or
   NULL if no descriptor refers to that inode.
   NOTE(review): the descriptor table is walked without taking any lock in
   this routine -- confirm callers cannot race with descriptors being
   opened or closed. */
void *
cxiCheckOpen(struct cxiNode_t* cnP)
{
  int count;
  int i;
  struct file** fdList;
  struct file*  fileP;
  struct inode* inodeP;

  count = current->files->max_fds;
  fdList = current->files->fd;
  inodeP = GNP_TO_VP(cnP);

  TRACE3(TRACE_VNODE,9,TRCID_CXICHECKOPEN_ENTRY,
         "cxiCheckOpen: entry.  %d files in fd list. Checking for inode %d "
         "at 0x%x", count, inodeP->i_ino, inodeP);

  for (i=0; i<count; i++)
  {
    fileP = fdList[i];
    if (fileP == NULL)
      continue;   /* unused descriptor slot */

    /* Use the pointer that was just tested rather than loading fdList[i]
       a second time, so the NULL check and the dereference agree */
    if (fileP->f_dentry->d_inode == inodeP)
    {
      TRACE1(TRACE_VNODE, 9,TRCID_CXICHECKOPEN_FOUND,
             "cxiCheckOpen: found open file. vinfoP 0x%x",
             fileP->private_data);
      return fileP->private_data;
    }
  }

  return NULL;
}
#endif


/* Get the address of the first byte not addressible by processes */
UIntPtr cxiGetKernelBoundary()
{
  return (UIntPtr) PAGE_OFFSET;
}


/* Return true if this process holds the big kernel lock (BKL), judged by
   a non-negative current->lock_depth. */
Boolean cxiHoldsBKL()
{
  /* A negative depth means the lock is not held by this task */
  return !(current->lock_depth < 0);
}


/* Yield the CPU to allow other processes to run.  This simply calls the
   scheduler; the task state is not changed here first. */
void
cxiYield()
{
  schedule();
}

/* Linux filldir has changed signatures depending on kernel level.
 * We always pass a 64bit offset from the GPFS layer; it is converted here
 * to loff_t for LINUX_KERNEL_VERSION 2040900 and later and to off_t for
 * earlier levels before the kernel's filldir callback is invoked.
 *
 *   vargP   - cxiFillDirArg_t carrying the filldir callback (fnP) and the
 *             opaque argument (argP) to hand to it
 *   nameP   - entry name
 *   namelen - length of the name
 *   offset  - directory offset of the entry
 *   ino     - inode number of the entry
 *
 * The entry type is always passed as 0 (DT_UNKNOWN).
 * Returns the value returned by the filldir callback.
 */
int
cxiFillDir(void *vargP, const char *nameP, int namelen, 
           offset_t offset, ino_t ino)
{
  cxiFillDirArg_t *fillDirArgP = (cxiFillDirArg_t *)vargP;
  filldir_t fnP = (filldir_t)fillDirArgP->fnP;
#if LINUX_KERNEL_VERSION >= 2040900
  loff_t dirOffset = (loff_t)offset;
#else
  off_t dirOffset = (off_t)offset;
#endif

  return fnP(fillDirArgP->argP, nameP, namelen,
             dirOffset, ino, 0 /* DT_UNKNOWN */);
}
